Bug Summary

File: build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 10227, column 35
Division by zero
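
For context, the sketch below is illustrative only: it is not the code at X86ISelLowering.cpp:10227 (which is outside this excerpt), but it shows the shape of pattern the analyzer's core division-by-zero checker flags, plus the usual guarded fix. The function and parameter names (scalePerLane, NumLanes) are hypothetical.

// Illustrative sketch only -- not the flagged LLVM code.
#include <cstdio>

static int scalePerLane(int TotalBits, int NumLanes) {
  // If any feasible path reaches this point with NumLanes == 0, the analyzer
  // reports "Division by zero" at the '/' below.
  return TotalBits / NumLanes;
}

static int scalePerLaneChecked(int TotalBits, int NumLanes) {
  // A typical fix: guard the divisor (or assert the invariant) before dividing.
  if (NumLanes == 0)
    return 0;
  return TotalBits / NumLanes;
}

int main() {
  std::printf("%d\n", scalePerLane(128, 4));        // prints 32
  std::printf("%d\n", scalePerLaneChecked(128, 0)); // guarded path, prints 0
  return 0;
}

In the full report, the annotated source marks each step of the path along which the analyzer concludes the divisor can be zero.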

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/X86 -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86 -I include -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-04-20-140412-16051-1 -x c++ /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
108 const X86Subtarget &STI)
109 : TargetLowering(TM), Subtarget(STI) {
110 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
111 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
112
113 // Set up the TargetLowering object.
114
115 // X86 is weird. It always uses i8 for shift amounts and setcc results.
116 setBooleanContents(ZeroOrOneBooleanContent);
117 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
118 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
119
120 // For 64-bit, since we have so many registers, use the ILP scheduler.
121 // For 32-bit, use the register pressure specific scheduling.
122 // For Atom, always use ILP scheduling.
123 if (Subtarget.isAtom())
124 setSchedulingPreference(Sched::ILP);
125 else if (Subtarget.is64Bit())
126 setSchedulingPreference(Sched::ILP);
127 else
128 setSchedulingPreference(Sched::RegPressure);
129 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
130 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
131
132 // Bypass expensive divides and use cheaper ones.
133 if (TM.getOptLevel() >= CodeGenOpt::Default) {
134 if (Subtarget.hasSlowDivide32())
135 addBypassSlowDiv(32, 8);
136 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
137 addBypassSlowDiv(64, 32);
138 }
139
140 // Setup Windows compiler runtime calls.
141 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
142 static const struct {
143 const RTLIB::Libcall Op;
144 const char * const Name;
145 const CallingConv::ID CC;
146 } LibraryCalls[] = {
147 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
148 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
149 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
150 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
151 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
152 };
153
154 for (const auto &LC : LibraryCalls) {
155 setLibcallName(LC.Op, LC.Name);
156 setLibcallCallingConv(LC.Op, LC.CC);
157 }
158 }
159
160 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
161 // MSVCRT doesn't have powi; fall back to pow
162 setLibcallName(RTLIB::POWI_F32, nullptr);
163 setLibcallName(RTLIB::POWI_F64, nullptr);
164 }
165
166 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
167 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
168 // FIXME: Should we be limiting the atomic size on other configs? Default is
169 // 1024.
170 if (!Subtarget.canUseCMPXCHG8B())
171 setMaxAtomicSizeInBitsSupported(32);
172
173 // Set up the register classes.
174 addRegisterClass(MVT::i8, &X86::GR8RegClass);
175 addRegisterClass(MVT::i16, &X86::GR16RegClass);
176 addRegisterClass(MVT::i32, &X86::GR32RegClass);
177 if (Subtarget.is64Bit())
178 addRegisterClass(MVT::i64, &X86::GR64RegClass);
179
180 for (MVT VT : MVT::integer_valuetypes())
181 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
182
183 // We don't accept any truncstore of integer registers.
184 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
185 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
186 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
187 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
188 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
189 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
190
191 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
192
193 // SETOEQ and SETUNE require checking two conditions.
194 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
195 setCondCodeAction(ISD::SETOEQ, VT, Expand);
196 setCondCodeAction(ISD::SETUNE, VT, Expand);
197 }
198
199 // Integer absolute.
200 if (Subtarget.canUseCMOV()) {
201 setOperationAction(ISD::ABS , MVT::i16 , Custom);
202 setOperationAction(ISD::ABS , MVT::i32 , Custom);
203 if (Subtarget.is64Bit())
204 setOperationAction(ISD::ABS , MVT::i64 , Custom);
205 }
206
207 // Signed saturation subtraction.
208 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
209 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
210 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
211 if (Subtarget.is64Bit())
212 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
213
214 // Funnel shifts.
215 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
216 // For slow shld targets we only lower for code size.
217 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
218
219 setOperationAction(ShiftOp , MVT::i8 , Custom);
220 setOperationAction(ShiftOp , MVT::i16 , Custom);
221 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
222 if (Subtarget.is64Bit())
223 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
224 }
225
226 if (!Subtarget.useSoftFloat()) {
227 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
228 // operation.
229 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
230 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
231 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
232 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
233 // We have an algorithm for SSE2, and we turn this into a 64-bit
234 // FILD or VCVTUSI2SS/SD for other targets.
235 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
236 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
237 // We have an algorithm for SSE2->double, and we turn this into a
238 // 64-bit FILD followed by conditional FADD for other targets.
239 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
240 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
241
242 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
243 // this operation.
244 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
245 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
246 // SSE has no i16 to fp conversion, only i32. We promote in the handler
247 // to allow f80 to use i16 and f64 to use i16 with sse1 only
248 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
249 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
250 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
251 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
252 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
253 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
254 // are Legal, f80 is custom lowered.
255 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
256 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
257
258 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
259 // this operation.
260 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
261 // FIXME: This doesn't generate invalid exception when it should. PR44019.
262 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
263 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
264 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
265 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
266 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
267 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
268 // are Legal, f80 is custom lowered.
269 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
270 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
271
272 // Handle FP_TO_UINT by promoting the destination to a larger signed
273 // conversion.
274 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
275 // FIXME: This doesn't generate invalid exception when it should. PR44019.
276 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
277 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
278 // FIXME: This doesn't generate invalid exception when it should. PR44019.
279 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
280 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
281 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
282 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
283 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
284
285 setOperationAction(ISD::LRINT, MVT::f32, Custom);
286 setOperationAction(ISD::LRINT, MVT::f64, Custom);
287 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
288 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
289
290 if (!Subtarget.is64Bit()) {
291 setOperationAction(ISD::LRINT, MVT::i64, Custom);
292 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
293 }
294 }
295
296 if (Subtarget.hasSSE2()) {
297 // Custom lowering for saturating float to int conversions.
298 // We handle promotion to larger result types manually.
299 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
300 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
301 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
302 }
303 if (Subtarget.is64Bit()) {
304 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
305 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
306 }
307 }
308
309 // Handle address space casts between mixed sized pointers.
310 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
311 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
312
313 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
314 if (!Subtarget.hasSSE2()) {
315 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
316 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
317 if (Subtarget.is64Bit()) {
318 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
319 // Without SSE, i64->f64 goes through memory.
320 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
321 }
322 } else if (!Subtarget.is64Bit())
323 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
324
325 // Scalar integer divide and remainder are lowered to use operations that
326 // produce two results, to match the available instructions. This exposes
327 // the two-result form to trivial CSE, which is able to combine x/y and x%y
328 // into a single instruction.
329 //
330 // Scalar integer multiply-high is also lowered to use two-result
331 // operations, to match the available instructions. However, plain multiply
332 // (low) operations are left as Legal, as there are single-result
333 // instructions for this in x86. Using the two-result multiply instructions
334 // when both high and low results are needed must be arranged by dagcombine.
335 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
336 setOperationAction(ISD::MULHS, VT, Expand);
337 setOperationAction(ISD::MULHU, VT, Expand);
338 setOperationAction(ISD::SDIV, VT, Expand);
339 setOperationAction(ISD::UDIV, VT, Expand);
340 setOperationAction(ISD::SREM, VT, Expand);
341 setOperationAction(ISD::UREM, VT, Expand);
342 }
343
344 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
345 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
346 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
347 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
348 setOperationAction(ISD::BR_CC, VT, Expand);
349 setOperationAction(ISD::SELECT_CC, VT, Expand);
350 }
351 if (Subtarget.is64Bit())
352 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
353 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
354 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
356
357 setOperationAction(ISD::FREM , MVT::f32 , Expand);
358 setOperationAction(ISD::FREM , MVT::f64 , Expand);
359 setOperationAction(ISD::FREM , MVT::f80 , Expand);
360 setOperationAction(ISD::FREM , MVT::f128 , Expand);
361
362 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
363 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
364 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
365 }
366
367 // Promote the i8 variants and force them on up to i32 which has a shorter
368 // encoding.
369 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
370 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
371
372 if (Subtarget.hasBMI()) {
373 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
374 // is enabled.
375 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
376 } else {
377 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
378 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
379 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
380 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
381 if (Subtarget.is64Bit()) {
382 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
383 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
384 }
385 }
386
387 if (Subtarget.hasLZCNT()) {
388 // When promoting the i8 variants, force them to i32 for a shorter
389 // encoding.
390 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
391 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
392 } else {
393 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
394 if (VT == MVT::i64 && !Subtarget.is64Bit())
395 continue;
396 setOperationAction(ISD::CTLZ , VT, Custom);
397 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
398 }
399 }
400
401 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
402 ISD::STRICT_FP_TO_FP16}) {
403 // Special handling for half-precision floating point conversions.
404 // If we don't have F16C support, then lower half float conversions
405 // into library calls.
406 setOperationAction(
407 Op, MVT::f32,
408 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
409 // There's never any support for operations beyond MVT::f32.
410 setOperationAction(Op, MVT::f64, Expand);
411 setOperationAction(Op, MVT::f80, Expand);
412 setOperationAction(Op, MVT::f128, Expand);
413 }
414
415 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
416 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
417 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
418 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
419 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
420 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
421 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
423
424 setOperationAction(ISD::PARITY, MVT::i8, Custom);
425 setOperationAction(ISD::PARITY, MVT::i16, Custom);
426 setOperationAction(ISD::PARITY, MVT::i32, Custom);
427 if (Subtarget.is64Bit())
428 setOperationAction(ISD::PARITY, MVT::i64, Custom);
429 if (Subtarget.hasPOPCNT()) {
430 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
431 // popcntw is longer to encode than popcntl and also has a false dependency
432 // on the dest that popcntl hasn't had since Cannon Lake.
433 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
434 } else {
435 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
436 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
437 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
438 if (Subtarget.is64Bit())
439 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
440 else
441 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
442 }
443
444 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
445
446 if (!Subtarget.hasMOVBE())
447 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
448
449 // X86 wants to expand cmov itself.
450 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451 setOperationAction(ISD::SELECT, VT, Custom);
452 setOperationAction(ISD::SETCC, VT, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455 }
456 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SELECT, VT, Custom);
460 setOperationAction(ISD::SETCC, VT, Custom);
461 }
462
463 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
464 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466
467 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
468 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475
476 // Darwin ABI issue.
477 for (auto VT : { MVT::i32, MVT::i64 }) {
478 if (VT == MVT::i64 && !Subtarget.is64Bit())
479 continue;
480 setOperationAction(ISD::ConstantPool , VT, Custom);
481 setOperationAction(ISD::JumpTable , VT, Custom);
482 setOperationAction(ISD::GlobalAddress , VT, Custom);
483 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484 setOperationAction(ISD::ExternalSymbol , VT, Custom);
485 setOperationAction(ISD::BlockAddress , VT, Custom);
486 }
487
488 // 64-bit shl, sra, srl (iff 32-bit x86)
489 for (auto VT : { MVT::i32, MVT::i64 }) {
490 if (VT == MVT::i64 && !Subtarget.is64Bit())
491 continue;
492 setOperationAction(ISD::SHL_PARTS, VT, Custom);
493 setOperationAction(ISD::SRA_PARTS, VT, Custom);
494 setOperationAction(ISD::SRL_PARTS, VT, Custom);
495 }
496
497 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
498 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
499
500 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
501
502 // Expand certain atomics
503 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511 }
512
513 if (!Subtarget.is64Bit())
514 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515
516 if (Subtarget.canUseCMPXCHG16B())
517 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518
519 // FIXME - use subtarget debug flags
520 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
521 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
522 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
523 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
524 }
525
526 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
527 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
528
529 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
530 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
531
532 setOperationAction(ISD::TRAP, MVT::Other, Legal);
533 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
534 if (Subtarget.getTargetTriple().isPS4())
535 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
536 else
537 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
538
539 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
540 setOperationAction(ISD::VASTART , MVT::Other, Custom);
541 setOperationAction(ISD::VAEND , MVT::Other, Expand);
542 bool Is64Bit = Subtarget.is64Bit();
543 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
544 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
545
546 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
547 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
548
549 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
550
551 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
552 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
553 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
554
555 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
556 // f32 and f64 use SSE.
557 // Set up the FP register classes.
558 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
559 : &X86::FR32RegClass);
560 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
561 : &X86::FR64RegClass);
562
563 // Disable f32->f64 extload as we can only generate this in one instruction
564 // under optsize. So it's easier to pattern match (fpext (load)) for that
565 // case instead of needing to emit 2 instructions for extload in the
566 // non-optsize case.
567 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
568
569 for (auto VT : { MVT::f32, MVT::f64 }) {
570 // Use ANDPD to simulate FABS.
571 setOperationAction(ISD::FABS, VT, Custom);
572
573 // Use XORP to simulate FNEG.
574 setOperationAction(ISD::FNEG, VT, Custom);
575
576 // Use ANDPD and ORPD to simulate FCOPYSIGN.
577 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
578
579 // These might be better off as horizontal vector ops.
580 setOperationAction(ISD::FADD, VT, Custom);
581 setOperationAction(ISD::FSUB, VT, Custom);
582
583 // We don't support sin/cos/fmod
584 setOperationAction(ISD::FSIN , VT, Expand);
585 setOperationAction(ISD::FCOS , VT, Expand);
586 setOperationAction(ISD::FSINCOS, VT, Expand);
587 }
588
589 // Lower this to MOVMSK plus an AND.
590 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
591 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
592
593 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
594 (UseX87 || Is64Bit)) {
595 // Use SSE for f32, x87 for f64.
596 // Set up the FP register classes.
597 addRegisterClass(MVT::f32, &X86::FR32RegClass);
598 if (UseX87)
599 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
600
601 // Use ANDPS to simulate FABS.
602 setOperationAction(ISD::FABS , MVT::f32, Custom);
603
604 // Use XORP to simulate FNEG.
605 setOperationAction(ISD::FNEG , MVT::f32, Custom);
606
607 if (UseX87)
608 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
609
610 // Use ANDPS and ORPS to simulate FCOPYSIGN.
611 if (UseX87)
612 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
613 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
614
615 // We don't support sin/cos/fmod
616 setOperationAction(ISD::FSIN , MVT::f32, Expand);
617 setOperationAction(ISD::FCOS , MVT::f32, Expand);
618 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
619
620 if (UseX87) {
621 // Always expand sin/cos functions even though x87 has an instruction.
622 setOperationAction(ISD::FSIN, MVT::f64, Expand);
623 setOperationAction(ISD::FCOS, MVT::f64, Expand);
624 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
625 }
626 } else if (UseX87) {
627 // f32 and f64 in x87.
628 // Set up the FP register classes.
629 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
630 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
631
632 for (auto VT : { MVT::f32, MVT::f64 }) {
633 setOperationAction(ISD::UNDEF, VT, Expand);
634 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
635
636 // Always expand sin/cos functions even though x87 has an instruction.
637 setOperationAction(ISD::FSIN , VT, Expand);
638 setOperationAction(ISD::FCOS , VT, Expand);
639 setOperationAction(ISD::FSINCOS, VT, Expand);
640 }
641 }
642
643 // Expand FP32 immediates into loads from the stack, save special cases.
644 if (isTypeLegal(MVT::f32)) {
645 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
646 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
647 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
648 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
649 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
650 } else // SSE immediates.
651 addLegalFPImmediate(APFloat(+0.0f)); // xorps
652 }
653 // Expand FP64 immediates into loads from the stack, save special cases.
654 if (isTypeLegal(MVT::f64)) {
655 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
656 addLegalFPImmediate(APFloat(+0.0)); // FLD0
657 addLegalFPImmediate(APFloat(+1.0)); // FLD1
658 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
659 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
660 } else // SSE immediates.
661 addLegalFPImmediate(APFloat(+0.0)); // xorpd
662 }
663 // Handle constrained floating-point operations of scalar.
664 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
665 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
666 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
667 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
668 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
669 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
670 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
671 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
672 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
673 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
674 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
675 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
676 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
677
678 // We don't support FMA.
679 setOperationAction(ISD::FMA, MVT::f64, Expand);
680 setOperationAction(ISD::FMA, MVT::f32, Expand);
681
682 // f80 always uses X87.
683 if (UseX87) {
684 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
685 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
686 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
687 {
688 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
689 addLegalFPImmediate(TmpFlt); // FLD0
690 TmpFlt.changeSign();
691 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
692
693 bool ignored;
694 APFloat TmpFlt2(+1.0);
695 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
696 &ignored);
697 addLegalFPImmediate(TmpFlt2); // FLD1
698 TmpFlt2.changeSign();
699 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
700 }
701
702 // Always expand sin/cos functions even though x87 has an instruction.
703 setOperationAction(ISD::FSIN , MVT::f80, Expand);
704 setOperationAction(ISD::FCOS , MVT::f80, Expand);
705 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
706
707 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
708 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
709 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
710 setOperationAction(ISD::FRINT, MVT::f80, Expand);
711 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
712 setOperationAction(ISD::FMA, MVT::f80, Expand);
713 setOperationAction(ISD::LROUND, MVT::f80, Expand);
714 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
715 setOperationAction(ISD::LRINT, MVT::f80, Custom);
716 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
717
718 // Handle constrained floating-point operations of scalar.
719 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
723 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
724 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
725 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
726 // as Custom.
727 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
728 }
729
730 // f128 uses xmm registers, but most operations require libcalls.
731 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
732 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
733 : &X86::VR128RegClass);
734
735 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
736
737 setOperationAction(ISD::FADD, MVT::f128, LibCall);
738 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
739 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
740 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
741 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
742 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
743 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
744 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
745 setOperationAction(ISD::FMA, MVT::f128, LibCall);
746 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
747
748 setOperationAction(ISD::FABS, MVT::f128, Custom);
749 setOperationAction(ISD::FNEG, MVT::f128, Custom);
750 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
751
752 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
753 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
754 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
755 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
756 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
757 // No STRICT_FSINCOS
758 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
759 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
760
761 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
762 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
763 // We need to custom handle any FP_ROUND with an f128 input, but
764 // LegalizeDAG uses the result type to know when to run a custom handler.
765 // So we have to list all legal floating point result types here.
766 if (isTypeLegal(MVT::f32)) {
767 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
768 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
769 }
770 if (isTypeLegal(MVT::f64)) {
771 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
772 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
773 }
774 if (isTypeLegal(MVT::f80)) {
775 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
776 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
777 }
778
779 setOperationAction(ISD::SETCC, MVT::f128, Custom);
780
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
782 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
783 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
785 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
786 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
787 }
788
789 // Always use a library call for pow.
790 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
792 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
793 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
794
795 setOperationAction(ISD::FLOG, MVT::f80, Expand);
796 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
797 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
798 setOperationAction(ISD::FEXP, MVT::f80, Expand);
799 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
800 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
801 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
802
803 // Some FP actions are always expanded for vector types.
804 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
805 MVT::v4f32, MVT::v8f32, MVT::v16f32,
806 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
807 setOperationAction(ISD::FSIN, VT, Expand);
808 setOperationAction(ISD::FSINCOS, VT, Expand);
809 setOperationAction(ISD::FCOS, VT, Expand);
810 setOperationAction(ISD::FREM, VT, Expand);
811 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
812 setOperationAction(ISD::FPOW, VT, Expand);
813 setOperationAction(ISD::FLOG, VT, Expand);
814 setOperationAction(ISD::FLOG2, VT, Expand);
815 setOperationAction(ISD::FLOG10, VT, Expand);
816 setOperationAction(ISD::FEXP, VT, Expand);
817 setOperationAction(ISD::FEXP2, VT, Expand);
818 }
819
820 // First set operation action for all vector types to either promote
821 // (for widening) or expand (for scalarization). Then we will selectively
822 // turn on ones that can be effectively codegen'd.
823 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
824 setOperationAction(ISD::SDIV, VT, Expand);
825 setOperationAction(ISD::UDIV, VT, Expand);
826 setOperationAction(ISD::SREM, VT, Expand);
827 setOperationAction(ISD::UREM, VT, Expand);
828 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
829 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
830 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
831 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
832 setOperationAction(ISD::FMA, VT, Expand);
833 setOperationAction(ISD::FFLOOR, VT, Expand);
834 setOperationAction(ISD::FCEIL, VT, Expand);
835 setOperationAction(ISD::FTRUNC, VT, Expand);
836 setOperationAction(ISD::FRINT, VT, Expand);
837 setOperationAction(ISD::FNEARBYINT, VT, Expand);
838 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
839 setOperationAction(ISD::MULHS, VT, Expand);
840 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
841 setOperationAction(ISD::MULHU, VT, Expand);
842 setOperationAction(ISD::SDIVREM, VT, Expand);
843 setOperationAction(ISD::UDIVREM, VT, Expand);
844 setOperationAction(ISD::CTPOP, VT, Expand);
845 setOperationAction(ISD::CTTZ, VT, Expand);
846 setOperationAction(ISD::CTLZ, VT, Expand);
847 setOperationAction(ISD::ROTL, VT, Expand);
848 setOperationAction(ISD::ROTR, VT, Expand);
849 setOperationAction(ISD::BSWAP, VT, Expand);
850 setOperationAction(ISD::SETCC, VT, Expand);
851 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
852 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
853 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
854 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
855 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
856 setOperationAction(ISD::TRUNCATE, VT, Expand);
857 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
858 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
859 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
860 setOperationAction(ISD::SELECT_CC, VT, Expand);
861 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
862 setTruncStoreAction(InnerVT, VT, Expand);
863
864 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
865 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
866
867 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
868 // types, we have to deal with them whether we ask for Expansion or not.
869 // Setting Expand causes its own optimisation problems though, so leave
870 // them legal.
871 if (VT.getVectorElementType() == MVT::i1)
872 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
873
874 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
875 // split/scalarized right now.
876 if (VT.getVectorElementType() == MVT::f16)
877 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
878 }
879 }
880
881 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
882 // with -msoft-float, disable use of MMX as well.
883 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
884 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
885 // No operations on x86mmx supported, everything uses intrinsics.
886 }
887
888 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
889 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
890 : &X86::VR128RegClass);
891
892 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
893 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
894 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
895 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
896 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
897 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
898 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
899 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
900
901 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
902 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
903
904 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
906 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
907 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
908 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
909 }
910
911 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
912 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
913 : &X86::VR128RegClass);
914
915 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
916 // registers cannot be used even for integer operations.
917 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
918 : &X86::VR128RegClass);
919 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
920 : &X86::VR128RegClass);
921 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
922 : &X86::VR128RegClass);
923 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
924 : &X86::VR128RegClass);
925
926 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
927 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
928 setOperationAction(ISD::SDIV, VT, Custom);
929 setOperationAction(ISD::SREM, VT, Custom);
930 setOperationAction(ISD::UDIV, VT, Custom);
931 setOperationAction(ISD::UREM, VT, Custom);
932 }
933
934 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
935 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
936 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
937
938 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
939 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
940 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
941 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
942 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
943 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
944 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
945 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
946 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
947 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
948 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
949 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
950
951 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
952 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
953
954 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
955 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
956 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
957
958 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
959 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
960 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
961 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
962 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
963 }
964
965 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
966 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
967 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
968 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
969 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
970 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
971 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
972 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
973 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
974 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
975
976 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
977 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
978 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
979 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
980
981 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
982 setOperationAction(ISD::SETCC, VT, Custom);
983 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
984 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
985 setOperationAction(ISD::CTPOP, VT, Custom);
986 setOperationAction(ISD::ABS, VT, Custom);
987
988 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
989 // setcc all the way to isel and prefer SETGT in some isel patterns.
990 setCondCodeAction(ISD::SETLT, VT, Custom);
991 setCondCodeAction(ISD::SETLE, VT, Custom);
992 }
993
994 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
995 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
996 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
997 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
998 setOperationAction(ISD::VSELECT, VT, Custom);
999 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1000 }
1001
1002 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
1003 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1004 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1005 setOperationAction(ISD::VSELECT, VT, Custom);
1006
1007 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1008 continue;
1009
1010 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1012 }
1013
1014 // Custom lower v2i64 and v2f64 selects.
1015 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1016 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1017 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1018 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1019 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1020
1021 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1022 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1023 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1024 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1025 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1026 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1027
1028 // Custom legalize these to avoid over promotion or custom promotion.
1029 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1030 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1031 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1032 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1033 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1034 }
1035
1036 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1037 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1038 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1039 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1040
1041 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1042 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1043
1044 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1045 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1046
1047 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1048 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1049 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1050 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1051 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1052
1053 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1054 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1055 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1056 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1057
1058 // We want to legalize this to an f64 load rather than an i64 load on
1059 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1060 // store.
1061 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1062 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1063 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1064 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1065 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1066 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1067
1068 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1069 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1070 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1071 if (!Subtarget.hasAVX512())
1072 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1073
1074 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1075 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1076 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1077
1078 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1079
1080 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1081 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1082 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1083 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1084 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1085 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1086
1087 // In the customized shift lowering, the legal v4i32/v2i64 cases
1088 // in AVX2 will be recognized.
1089 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1090 setOperationAction(ISD::SRL, VT, Custom);
1091 setOperationAction(ISD::SHL, VT, Custom);
1092 setOperationAction(ISD::SRA, VT, Custom);
1093 if (VT == MVT::v2i64) continue;
1094 setOperationAction(ISD::ROTL, VT, Custom);
1095 setOperationAction(ISD::ROTR, VT, Custom);
1096 setOperationAction(ISD::FSHL, VT, Custom);
1097 setOperationAction(ISD::FSHR, VT, Custom);
1098 }
1099
1100 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1103 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1104 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1105 }
1106
1107 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1108 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1109 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1110 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1111 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1114 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1115 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1116
1117 // These might be better off as horizontal vector ops.
1118 setOperationAction(ISD::ADD, MVT::i16, Custom);
1119 setOperationAction(ISD::ADD, MVT::i32, Custom);
1120 setOperationAction(ISD::SUB, MVT::i16, Custom);
1121 setOperationAction(ISD::SUB, MVT::i32, Custom);
1122 }
1123
1124 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1125 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1126 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1127 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1128 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1129 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1130 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1131 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1132 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1133 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1134 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1135 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1136 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1137 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1138
1139 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1140 }
1141
1142 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1143 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1144 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1145 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1146 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1147 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1148 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1149 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1150
1151 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1152 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1153 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1154
1155 // FIXME: Do we need to handle scalar-to-vector here?
1156 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1157
1158 // We directly match byte blends in the backend as they match the VSELECT
1159 // condition form.
1160 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1161
1162 // SSE41 brings specific instructions for doing vector sign extend even in
1163 // cases where we don't have SRA.
1164 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1165 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1166 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1167 }
1168
1169 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1170 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1171 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1173 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1174 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1175 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1176 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1177 }
1178
1179 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1180 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181 // do the pre and post work in the vector domain.
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185 // so that DAG combine doesn't try to turn it into uint_to_fp.
1186 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1187 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188 }
1189 }
1190
1191 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1192 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1193 }
1194
1195 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1197 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1198 setOperationAction(ISD::ROTL, VT, Custom);
1199 setOperationAction(ISD::ROTR, VT, Custom);
1200 }
1201
1202 // XOP can efficiently perform BITREVERSE with VPPERM.
1203 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1204 setOperationAction(ISD::BITREVERSE, VT, Custom);
1205
1206 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1207 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1208 setOperationAction(ISD::BITREVERSE, VT, Custom);
1209 }
1210
1211 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1212 bool HasInt256 = Subtarget.hasInt256();
1213
1214 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215 : &X86::VR256RegClass);
1216 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1217 : &X86::VR256RegClass);
1218 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219 : &X86::VR256RegClass);
1220 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1221 : &X86::VR256RegClass);
1222 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1223 : &X86::VR256RegClass);
1224 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1225 : &X86::VR256RegClass);
1226
1227 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1228 setOperationAction(ISD::FFLOOR, VT, Legal);
1229 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1230 setOperationAction(ISD::FCEIL, VT, Legal);
1231 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1232 setOperationAction(ISD::FTRUNC, VT, Legal);
1233 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1234 setOperationAction(ISD::FRINT, VT, Legal);
1235 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1236 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1237 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1238 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1239 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1240
1241 setOperationAction(ISD::FROUND, VT, Custom);
1242
1243 setOperationAction(ISD::FNEG, VT, Custom);
1244 setOperationAction(ISD::FABS, VT, Custom);
1245 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1246 }
1247
1248 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1249 // even though v8i16 is a legal type.
1250 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1253 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1254 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1255 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1256 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1257
1258 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1259 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1260
1261 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1262 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1263 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1264 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1265 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1266 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1267 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1268 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1269 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1270 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1271 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1272 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1273
1274 if (!Subtarget.hasAVX512())
1275 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1276
1277 // In the custom shift lowering, the v8i32/v4i64 cases that are legal
1278 // under AVX2 will be recognized.
1279 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1280 setOperationAction(ISD::SRL, VT, Custom);
1281 setOperationAction(ISD::SHL, VT, Custom);
1282 setOperationAction(ISD::SRA, VT, Custom);
1283 if (VT == MVT::v4i64) continue;
1284 setOperationAction(ISD::ROTL, VT, Custom);
1285 setOperationAction(ISD::ROTR, VT, Custom);
1286 setOperationAction(ISD::FSHL, VT, Custom);
1287 setOperationAction(ISD::FSHR, VT, Custom);
1288 }
1289
1290 // These types need custom splitting if their input is a 128-bit vector.
1291 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1292 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1293 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1294 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1295
1296 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1297 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1298 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1299 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1300 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1302
1303 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1305 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1307 }
1308
1309 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1310 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1311 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1312 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1313
1314 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315 setOperationAction(ISD::SETCC, VT, Custom);
1316 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1317 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1318 setOperationAction(ISD::CTPOP, VT, Custom);
1319 setOperationAction(ISD::CTLZ, VT, Custom);
1320
1321 // These condition codes aren't legal in SSE/AVX, and under AVX512 we keep
1322 // setcc all the way to isel and prefer SETGT in some isel patterns.
1323 setCondCodeAction(ISD::SETLT, VT, Custom);
1324 setCondCodeAction(ISD::SETLE, VT, Custom);
1325 }
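As a side note on the comment above, the scalar identities behind that choice can be stated directly; this is an editorial sketch (SSE/AVX natively provide signed greater-than via PCMPGT), not the exact DAG rewrite performed here, and the helper name is invented.

#include <cassert>
// Less-than style predicates can always be rephrased in terms of
// greater-than, which is the compare form the hardware provides.
void checkCompareIdentities(int A, int B) {
  assert((A < B) == (B > A));   // SETLT(a,b) == SETGT(b,a), operands swapped
  assert((A <= B) == !(A > B)); // SETLE(a,b) == NOT SETGT(a,b)
}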
1326
1327 if (Subtarget.hasAnyFMA()) {
1328 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329 MVT::v2f64, MVT::v4f64 }) {
1330 setOperationAction(ISD::FMA, VT, Legal);
1331 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332 }
1333 }
1334
1335 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338 }
1339
1340 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1341 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1342 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1343 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1344
1345 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1346 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1347 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1348 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1349 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1350 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1351 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1352 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1353
1354 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1355 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1356
1357 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1358 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1359 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1360 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1361 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1362
1363 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1364 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1365 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1370 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1371 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1372 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1373 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1374 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1375
1376 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1377 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1378 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1379 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1380 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1381 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1382 }
1383
1384 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1385 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1386 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1387 }
1388
1389 if (HasInt256) {
1390 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1391 // when we have a 256-bit-wide blend with immediate.
1392 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1394
1395 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1396 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1397 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1398 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1399 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1403 }
1404 }
1405
1406 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1407 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1408 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1409 setOperationAction(ISD::MSTORE, VT, Legal);
1410 }
1411
1412 // Extract subvector is special because the value type
1413 // (result) is 128-bit but the source is 256-bit wide.
1414 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1415 MVT::v4f32, MVT::v2f64 }) {
1416 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1417 }
1418
1419 // Custom lower several nodes for 256-bit types.
1420 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1421 MVT::v8f32, MVT::v4f64 }) {
1422 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1423 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1424 setOperationAction(ISD::VSELECT, VT, Custom);
1425 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1426 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1427 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1428 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1429 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1430 setOperationAction(ISD::STORE, VT, Custom);
1431 }
1432
1433 if (HasInt256) {
1434 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1435
1436 // Custom legalize 2-element 32-bit gathers to get a little better code.
1437 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1438 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1439
1440 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1441 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1442 setOperationAction(ISD::MGATHER, VT, Custom);
1443 }
1444 }
1445
1446 // This block controls legalization of the mask vector sizes that are
1447 // available with AVX512. 512-bit vectors are in a separate block controlled
1448 // by useAVX512Regs.
1449 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1450 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1451 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1452 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1453 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1454 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1455
1456 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1457 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1458 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1459
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1462 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1463 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1464 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1465 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1466 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1467 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1468 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1469 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1470 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1471 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1472
1473 // There is no byte-sized k-register load or store without AVX512DQ.
1474 if (!Subtarget.hasDQI()) {
1475 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1476 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1477 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1478 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1479
1480 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1481 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1482 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1483 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1484 }
1485
1486 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1487 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1488 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1489 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1490 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1491 }
1492
1493 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1494 setOperationAction(ISD::VSELECT, VT, Expand);
1495
1496 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1497 setOperationAction(ISD::SETCC, VT, Custom);
1498 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1499 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1500 setOperationAction(ISD::SELECT, VT, Custom);
1501 setOperationAction(ISD::TRUNCATE, VT, Custom);
1502
1503 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1505 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1507 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1508 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1509 }
1510
1511 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1512 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1513 }
1514
1515 // This block controls legalization for 512-bit operations with 32/64-bit
1516 // elements. 512-bit operations can be disabled based on the prefer-vector-width
1517 // and required-vector-width function attributes.
1518 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1519 bool HasBWI = Subtarget.hasBWI();
1520
1521 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1522 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1523 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1525 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1526 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1527
1528 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1529 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1530 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1531 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1532 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1533 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1534 if (HasBWI)
1535 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1536 }
1537
1538 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1539 setOperationAction(ISD::FNEG, VT, Custom);
1540 setOperationAction(ISD::FABS, VT, Custom);
1541 setOperationAction(ISD::FMA, VT, Legal);
1542 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1543 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1544 }
1545
1546 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1547 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1548 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1549 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1550 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1551 }
1552 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1553 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1554 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1555 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1556 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1557 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1558 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1559 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1560
1561 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1562 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1563 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1564 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1565 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1566 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1567 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1568 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1569 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1570 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1571 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1572 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1573
1574 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1575 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1576 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1577 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1578 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1579 if (HasBWI)
1580 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1581
1582 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1583 // to 512-bit rather than use the AVX2 instructions so that we can use
1584 // k-masks.
1585 if (!Subtarget.hasVLX()) {
1586 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1587 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1588 setOperationAction(ISD::MLOAD, VT, Custom);
1589 setOperationAction(ISD::MSTORE, VT, Custom);
1590 }
1591 }
1592
1593 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1594 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1595 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1596 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1597 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1598 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1599 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1600 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1601 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1602 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1603 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1604 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1605 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1606
1607 if (HasBWI) {
1608 // Extends from v64i1 masks to 512-bit vectors.
1609 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1610 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1611 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1612 }
1613
1614 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1615 setOperationAction(ISD::FFLOOR, VT, Legal);
1616 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1617 setOperationAction(ISD::FCEIL, VT, Legal);
1618 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1619 setOperationAction(ISD::FTRUNC, VT, Legal);
1620 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1621 setOperationAction(ISD::FRINT, VT, Legal);
1622 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1623 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1624 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1625 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1626 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1627
1628 setOperationAction(ISD::FROUND, VT, Custom);
1629 }
1630
1631 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1632 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1633 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1634 }
1635
1636 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1637 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1638 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1639 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1640
1641 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1642 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1643 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1644 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1645
1646 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1647 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1648 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1649 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1650 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1651 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1652 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1653 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1654
1655 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1656 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1657
1658 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1659
1660 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1661 setOperationAction(ISD::SRL, VT, Custom);
1662 setOperationAction(ISD::SHL, VT, Custom);
1663 setOperationAction(ISD::SRA, VT, Custom);
1664 setOperationAction(ISD::ROTL, VT, Custom);
1665 setOperationAction(ISD::ROTR, VT, Custom);
1666 setOperationAction(ISD::SETCC, VT, Custom);
1667
1668 // These condition codes aren't legal in SSE/AVX, and under AVX512 we keep
1669 // setcc all the way to isel and prefer SETGT in some isel patterns.
1670 setCondCodeAction(ISD::SETLT, VT, Custom);
1671 setCondCodeAction(ISD::SETLE, VT, Custom);
1672 }
1673 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1674 setOperationAction(ISD::SMAX, VT, Legal);
1675 setOperationAction(ISD::UMAX, VT, Legal);
1676 setOperationAction(ISD::SMIN, VT, Legal);
1677 setOperationAction(ISD::UMIN, VT, Legal);
1678 setOperationAction(ISD::ABS, VT, Legal);
1679 setOperationAction(ISD::CTPOP, VT, Custom);
1680 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1681 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1682 }
1683
1684 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1685 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1686 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1687 setOperationAction(ISD::CTLZ, VT, Custom);
1688 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1692 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1693 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1694 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1695 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1696 }
1697
1698 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1699 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1700 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1701 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1702 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1703 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1704
1705 if (Subtarget.hasDQI()) {
1706 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1707 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1708 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1709 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1710 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1711 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1712 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1713 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1714
1715 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1716 }
1717
1718 if (Subtarget.hasCDI()) {
1719 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1720 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1721 setOperationAction(ISD::CTLZ, VT, Legal);
1722 }
1723 } // Subtarget.hasCDI()
1724
1725 if (Subtarget.hasVPOPCNTDQ()) {
1726 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1727 setOperationAction(ISD::CTPOP, VT, Legal);
1728 }
1729
1730 // Extract subvector is special because the value type
1731 // (result) is 256-bit but the source is 512-bit wide.
1732 // 128-bit was made Legal under AVX1.
1733 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1734 MVT::v8f32, MVT::v4f64 })
1735 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1736
1737 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1738 MVT::v16f32, MVT::v8f64 }) {
1739 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1740 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1741 setOperationAction(ISD::SELECT, VT, Custom);
1742 setOperationAction(ISD::VSELECT, VT, Custom);
1743 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1744 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1745 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1746 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1747 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1748 }
1749
1750 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1751 setOperationAction(ISD::MLOAD, VT, Legal);
1752 setOperationAction(ISD::MSTORE, VT, Legal);
1753 setOperationAction(ISD::MGATHER, VT, Custom);
1754 setOperationAction(ISD::MSCATTER, VT, Custom);
1755 }
1756 if (HasBWI) {
1757 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1758 setOperationAction(ISD::MLOAD, VT, Legal);
1759 setOperationAction(ISD::MSTORE, VT, Legal);
1760 }
1761 } else {
1762 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1763 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1764 }
1765
1766 if (Subtarget.hasVBMI2()) {
1767 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1768 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1769 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1770 setOperationAction(ISD::FSHL, VT, Custom);
1771 setOperationAction(ISD::FSHR, VT, Custom);
1772 }
1773
1774 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1775 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1776 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1777 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1778 }
1779 }// useAVX512Regs
1780
1781 // This block controls legalization for operations that don't have
1782 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1783 // narrower widths.
1784 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1785 // These operations are handled on non-VLX by artificially widening in
1786 // isel patterns.
1787
1788 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1789 Subtarget.hasVLX() ? Legal : Custom);
1790 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1791 Subtarget.hasVLX() ? Legal : Custom);
1792 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1793 Subtarget.hasVLX() ? Legal : Custom);
1794 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1795 Subtarget.hasVLX() ? Legal : Custom);
1796 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1797 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1798 Subtarget.hasVLX() ? Legal : Custom);
1799 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1800 Subtarget.hasVLX() ? Legal : Custom);
1801 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1802 Subtarget.hasVLX() ? Legal : Custom);
1803 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1804 Subtarget.hasVLX() ? Legal : Custom);
1805
1806 if (Subtarget.hasDQI()) {
1807 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1808 // v2f32 UINT_TO_FP is already custom under SSE2.
1809 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1810 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1811 "Unexpected operation action!");
1812 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1813 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1814 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1815 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1816 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1817 }
1818
1819 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1820 setOperationAction(ISD::SMAX, VT, Legal);
1821 setOperationAction(ISD::UMAX, VT, Legal);
1822 setOperationAction(ISD::SMIN, VT, Legal);
1823 setOperationAction(ISD::UMIN, VT, Legal);
1824 setOperationAction(ISD::ABS, VT, Legal);
1825 }
1826
1827 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1828 setOperationAction(ISD::ROTL, VT, Custom);
1829 setOperationAction(ISD::ROTR, VT, Custom);
1830 }
1831
1832 // Custom legalize 2-element 32-bit scatters to get a little better code.
1833 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1834 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1835
1836 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1837 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1838 setOperationAction(ISD::MSCATTER, VT, Custom);
1839
1840 if (Subtarget.hasDQI()) {
1841 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1842 setOperationAction(ISD::SINT_TO_FP, VT,
1843 Subtarget.hasVLX() ? Legal : Custom);
1844 setOperationAction(ISD::UINT_TO_FP, VT,
1845 Subtarget.hasVLX() ? Legal : Custom);
1846 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1847 Subtarget.hasVLX() ? Legal : Custom);
1848 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1849 Subtarget.hasVLX() ? Legal : Custom);
1850 setOperationAction(ISD::FP_TO_SINT, VT,
1851 Subtarget.hasVLX() ? Legal : Custom);
1852 setOperationAction(ISD::FP_TO_UINT, VT,
1853 Subtarget.hasVLX() ? Legal : Custom);
1854 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1855 Subtarget.hasVLX() ? Legal : Custom);
1856 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1857 Subtarget.hasVLX() ? Legal : Custom);
1858 setOperationAction(ISD::MUL, VT, Legal);
1859 }
1860 }
1861
1862 if (Subtarget.hasCDI()) {
1863 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1864 setOperationAction(ISD::CTLZ, VT, Legal);
1865 }
1866 } // Subtarget.hasCDI()
1867
1868 if (Subtarget.hasVPOPCNTDQ()) {
1869 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1870 setOperationAction(ISD::CTPOP, VT, Legal);
1871 }
1872 }
1873
1874 // This block controls legalization of v32i1/v64i1, which are available with
1875 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1876 // useBWIRegs.
1877 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1878 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1879 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1880
1881 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1882 setOperationAction(ISD::VSELECT, VT, Expand);
1883 setOperationAction(ISD::TRUNCATE, VT, Custom);
1884 setOperationAction(ISD::SETCC, VT, Custom);
1885 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1886 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1887 setOperationAction(ISD::SELECT, VT, Custom);
1888 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1889 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1890 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1891 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1892 }
1893
1894 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1895 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1896
1897 // Extends from v32i1 masks to 256-bit vectors.
1898 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1899 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1900 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1901
1902 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1903 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1904 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1905 }
1906
1907 // These operations are handled on non-VLX by artificially widening in
1908 // isel patterns.
1909 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1910
1911 if (Subtarget.hasBITALG()) {
1912 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1913 setOperationAction(ISD::CTPOP, VT, Legal);
1914 }
1915 }
1916
1917 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
1918 auto setGroup = [&] (MVT VT) {
1919 setOperationAction(ISD::FADD, VT, Legal);
1920 setOperationAction(ISD::STRICT_FADD, VT, Legal);
1921 setOperationAction(ISD::FSUB, VT, Legal);
1922 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
1923 setOperationAction(ISD::FMUL, VT, Legal);
1924 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
1925 setOperationAction(ISD::FDIV, VT, Legal);
1926 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
1927 setOperationAction(ISD::FSQRT, VT, Legal);
1928 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
1929
1930 setOperationAction(ISD::FFLOOR, VT, Legal);
1931 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1932 setOperationAction(ISD::FCEIL, VT, Legal);
1933 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1934 setOperationAction(ISD::FTRUNC, VT, Legal);
1935 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1936 setOperationAction(ISD::FRINT, VT, Legal);
1937 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1938 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1939 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1940
1941 setOperationAction(ISD::LOAD, VT, Legal);
1942 setOperationAction(ISD::STORE, VT, Legal);
1943
1944 setOperationAction(ISD::FMA, VT, Legal);
1945 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1946 setOperationAction(ISD::VSELECT, VT, Legal);
1947 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1948 setOperationAction(ISD::SELECT, VT, Custom);
1949
1950 setOperationAction(ISD::FNEG, VT, Custom);
1951 setOperationAction(ISD::FABS, VT, Custom);
1952 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1953 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1954 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1955 };
1956
1957 // AVX512_FP16 scalar operations
1958 setGroup(MVT::f16);
1959 addRegisterClass(MVT::f16, &X86::FR16XRegClass);
1960 setOperationAction(ISD::FREM, MVT::f16, Promote);
1961 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
1962 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
1963 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
1964 setOperationAction(ISD::SETCC, MVT::f16, Custom);
1965 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
1966 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1967 setOperationAction(ISD::FROUND, MVT::f16, Custom);
1968 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
1969 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
1970 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
1971 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
1972 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
1973 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
1974 if (isTypeLegal(MVT::f80)) {
1975 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
1976 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
1977 }
1978
1979 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
1980 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
1981
1982 if (Subtarget.useAVX512Regs()) {
1983 setGroup(MVT::v32f16);
1984 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1985 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
1986 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
1987 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
1988 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
1989 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
1990 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
1991 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1992 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
1993
1994 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
1995 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
1996 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
1997 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
1998 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
1999 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2000 MVT::v32i16);
2001 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2002 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2003 MVT::v32i16);
2004 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2005 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2006 MVT::v32i16);
2007 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2008 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2009 MVT::v32i16);
2010
2011 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2012 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2013 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2014
2015 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2016 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2017
2018 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2019 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2020 }
2021
2022 if (Subtarget.hasVLX()) {
2023 addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
2024 addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
2025 setGroup(MVT::v8f16);
2026 setGroup(MVT::v16f16);
2027
2028 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2029 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2030 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2031 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2032 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2033 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2034 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2035 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2036 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2037 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2038
2039 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2040 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2041 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2042 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2043 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2044 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2045
2046 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2047 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2048 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2049
2050 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2051 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2052 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2053
2054 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2055 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2056 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2057 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2058
2059 // Need to custom widen these to prevent scalarization.
2060 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2061 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2062 }
2063
2064 // Support fp16 0 immediate
2065 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
2066 }
2067
2068 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2069 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2070 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2071 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2072 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2073 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2074
2075 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2076 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2077 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2078 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2079 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2080
2081 if (Subtarget.hasBWI()) {
2082 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2083 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2084 }
2085
2086 if (Subtarget.hasFP16()) {
2087 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2088 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2089 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2090 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2091 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2092 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2093 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2094 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2095 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2096 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2097 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2098 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2099 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2100 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2101 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2102 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2103 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2104 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2105 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2106 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2107 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2108 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2109 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2110 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2111 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2112 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2113 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2114 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2115 }
2116
2117 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2118 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2119 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2120 }
2121
2122 if (Subtarget.hasAMXTILE()) {
2123 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2124 }
2125
2126 // We want to custom lower some of our intrinsics.
2127 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2128 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2129 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2130 if (!Subtarget.is64Bit()) {
2131 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2132 }
2133
2134 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2135 // handle type legalization for these operations here.
2136 //
2137 // FIXME: We really should do custom legalization for addition and
2138 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2139 // than generic legalization for 64-bit multiplication-with-overflow, though.
2140 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2141 if (VT == MVT::i64 && !Subtarget.is64Bit())
2142 continue;
2143 // Add/Sub/Mul with overflow operations are custom lowered.
2144 setOperationAction(ISD::SADDO, VT, Custom);
2145 setOperationAction(ISD::UADDO, VT, Custom);
2146 setOperationAction(ISD::SSUBO, VT, Custom);
2147 setOperationAction(ISD::USUBO, VT, Custom);
2148 setOperationAction(ISD::SMULO, VT, Custom);
2149 setOperationAction(ISD::UMULO, VT, Custom);
2150
2151 // Support carry in as value rather than glue.
2152 setOperationAction(ISD::ADDCARRY, VT, Custom);
2153 setOperationAction(ISD::SUBCARRY, VT, Custom);
2154 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2155 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2156 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2157 }
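To make the intent of the overflow/carry hooks concrete, a hedged scalar example of the pattern they serve; the builtin is the usual GCC/Clang entry point, and the helper name is invented for illustration.

// Sum and overflow flag are produced together; on x86 this typically becomes
// a single ADD plus a flag read (SETO/SETC) via the SADDO custom lowering.
bool addWithOverflow(int A, int B, int &Sum) {
  return __builtin_sadd_overflow(A, B, &Sum);
}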
2158
2159 if (!Subtarget.is64Bit()) {
2160 // These libcalls are not available in 32-bit.
2161 setLibcallName(RTLIB::SHL_I128, nullptr);
2162 setLibcallName(RTLIB::SRL_I128, nullptr);
2163 setLibcallName(RTLIB::SRA_I128, nullptr);
2164 setLibcallName(RTLIB::MUL_I128, nullptr);
2165 // The MULO libcall is not part of libgcc, only compiler-rt.
2166 setLibcallName(RTLIB::MULO_I64, nullptr);
2167 }
2168 // The MULO libcall is not part of libgcc, only compiler-rt.
2169 setLibcallName(RTLIB::MULO_I128, nullptr);
2170
2171 // Combine sin / cos into _sincos_stret if it is available.
2172 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2173 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2174 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2175 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2176 }
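A source-level shape that can benefit from this combine, as a hedged sketch; whether the merged call is actually emitted still depends on the libcall availability check above, and the function name is invented.

#include <cmath>
// sin and cos of the same argument can be merged into one
// __sincos_stret-style call when the runtime provides it.
void polarToCartesian(float R, float Theta, float &X, float &Y) {
  X = R * std::cos(Theta); // same angle for both calls...
  Y = R * std::sin(Theta); // ...so they are candidates for a single sincos
}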
2177
2178 if (Subtarget.isTargetWin64()) {
2179 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2180 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2181 setOperationAction(ISD::SREM, MVT::i128, Custom);
2182 setOperationAction(ISD::UREM, MVT::i128, Custom);
2183 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2184 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2185 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2187 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2188 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2189 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2190 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2191 }
2192
2193 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2194 // is. We should promote the value to 64-bits to solve this.
2195 // This is what the CRT headers do - `fmodf` is an inline header
2196 // function casting to f64 and calling `fmod`.
2197 if (Subtarget.is32Bit() &&
2198 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2199 for (ISD::NodeType Op :
2200 {ISD::FCEIL, ISD::STRICT_FCEIL,
2201 ISD::FCOS, ISD::STRICT_FCOS,
2202 ISD::FEXP, ISD::STRICT_FEXP,
2203 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2204 ISD::FREM, ISD::STRICT_FREM,
2205 ISD::FLOG, ISD::STRICT_FLOG,
2206 ISD::FLOG10, ISD::STRICT_FLOG10,
2207 ISD::FPOW, ISD::STRICT_FPOW,
2208 ISD::FSIN, ISD::STRICT_FSIN})
2209 if (isOperationExpand(Op, MVT::f32))
2210 setOperationAction(Op, MVT::f32, Promote);
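For clarity, roughly what that promotion amounts to at the source level; a hedged sketch mirroring the CRT-header behaviour described in the comment above, with an invented function name.

#include <cmath>
// f32 -> f64 promotion: compute in double precision, then narrow back.
static inline float fmodfViaPromote(float X, float Y) {
  return static_cast<float>(std::fmod(static_cast<double>(X),
                                      static_cast<double>(Y)));
}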
2211
2212 // We have target-specific dag combine patterns for the following nodes:
2213 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2214 ISD::SCALAR_TO_VECTOR,
2215 ISD::INSERT_VECTOR_ELT,
2216 ISD::EXTRACT_VECTOR_ELT,
2217 ISD::CONCAT_VECTORS,
2218 ISD::INSERT_SUBVECTOR,
2219 ISD::EXTRACT_SUBVECTOR,
2220 ISD::BITCAST,
2221 ISD::VSELECT,
2222 ISD::SELECT,
2223 ISD::SHL,
2224 ISD::SRA,
2225 ISD::SRL,
2226 ISD::OR,
2227 ISD::AND,
2228 ISD::ADD,
2229 ISD::FADD,
2230 ISD::FSUB,
2231 ISD::FNEG,
2232 ISD::FMA,
2233 ISD::STRICT_FMA,
2234 ISD::FMINNUM,
2235 ISD::FMAXNUM,
2236 ISD::SUB,
2237 ISD::LOAD,
2238 ISD::MLOAD,
2239 ISD::STORE,
2240 ISD::MSTORE,
2241 ISD::TRUNCATE,
2242 ISD::ZERO_EXTEND,
2243 ISD::ANY_EXTEND,
2244 ISD::SIGN_EXTEND,
2245 ISD::SIGN_EXTEND_INREG,
2246 ISD::ANY_EXTEND_VECTOR_INREG,
2247 ISD::SIGN_EXTEND_VECTOR_INREG,
2248 ISD::ZERO_EXTEND_VECTOR_INREG,
2249 ISD::SINT_TO_FP,
2250 ISD::UINT_TO_FP,
2251 ISD::STRICT_SINT_TO_FP,
2252 ISD::STRICT_UINT_TO_FP,
2253 ISD::SETCC,
2254 ISD::MUL,
2255 ISD::XOR,
2256 ISD::MSCATTER,
2257 ISD::MGATHER,
2258 ISD::FP16_TO_FP,
2259 ISD::FP_EXTEND,
2260 ISD::STRICT_FP_EXTEND,
2261 ISD::FP_ROUND});
2262
2263 computeRegisterProperties(Subtarget.getRegisterInfo());
2264
2265 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2266 MaxStoresPerMemsetOptSize = 8;
2267 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2268 MaxStoresPerMemcpyOptSize = 4;
2269 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2270 MaxStoresPerMemmoveOptSize = 4;
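As a hedged illustration of what these thresholds govern (the precise cutover also depends on the widest legal store type for the subtarget, so this is indicative only; the struct and function are invented for the example):

#include <cstring>
// With MaxStoresPerMemset = 16, a small fixed-size memset like this is
// normally inlined as a short run of stores rather than a memset libcall.
struct Packet { char Buf[64]; };
void clearPacket(Packet &P) { std::memset(P.Buf, 0, sizeof P.Buf); }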
2271
2272 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2273 // that needs to be benchmarked and balanced with the potential use of vector
2274 // load/store types (PR33329, PR33914).
2275 MaxLoadsPerMemcmp = 2;
2276 MaxLoadsPerMemcmpOptSize = 2;
2277
2278 // Default loop alignment, which can be overridden by -align-loops.
2279 setPrefLoopAlignment(Align(16));
2280
2281 // An out-of-order CPU can speculatively execute past a predictable branch,
2282 // but a conditional move could be stalled by an expensive earlier operation.
2283 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2284 EnableExtLdPromotion = true;
2285 setPrefFunctionAlignment(Align(16));
2286
2287 verifyIntrinsicTables();
2288
2289 // Default to having -disable-strictnode-mutation on
2290 IsStrictFPEnabled = true;
2291}
2292
2293// This has so far only been implemented for 64-bit MachO.
2294bool X86TargetLowering::useLoadStackGuardNode() const {
2295 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2296}
2297
2298bool X86TargetLowering::useStackGuardXorFP() const {
2299 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2300 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2301}
2302
2303SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2304 const SDLoc &DL) const {
2305 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2306 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2307 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2308 return SDValue(Node, 0);
2309}
2310
2311TargetLoweringBase::LegalizeTypeAction
2312X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2313 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2314 !Subtarget.hasBWI())
2315 return TypeSplitVector;
2316
2317 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2318 VT.getVectorElementType() != MVT::i1)
2319 return TypeWidenVector;
2320
2321 return TargetLoweringBase::getPreferredVectorAction(VT);
2322}
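A hedged reading of the rules above as a hypothetical check (not code from the tree; it assumes the usual LLVM headers, <cassert>, and a TLI built for an AVX512 subtarget without BWI):

// v32i1 hits the mask-splitting branch; v3f32 is a non-i1 multi-element
// vector and therefore widens.
void checkPreferredVectorAction(const X86TargetLowering &TLI) {
  assert(TLI.getPreferredVectorAction(MVT::v32i1) ==
         TargetLoweringBase::TypeSplitVector);
  assert(TLI.getPreferredVectorAction(MVT::v3f32) ==
         TargetLoweringBase::TypeWidenVector);
}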
2323
2324static std::pair<MVT, unsigned>
2325handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2326 const X86Subtarget &Subtarget) {
2327 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2328 // convention is one that uses k registers.
2329 if (NumElts == 2)
2330 return {MVT::v2i64, 1};
2331 if (NumElts == 4)
2332 return {MVT::v4i32, 1};
2333 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2334 CC != CallingConv::Intel_OCL_BI)
2335 return {MVT::v8i16, 1};
2336 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2337 CC != CallingConv::Intel_OCL_BI)
2338 return {MVT::v16i8, 1};
2339 // v32i1 passes in ymm unless we have BWI and the calling convention is
2340 // regcall.
2341 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2342 return {MVT::v32i8, 1};
2343 // Split v64i1 vectors if we don't have v64i8 available.
2344 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2345 if (Subtarget.useAVX512Regs())
2346 return {MVT::v64i8, 1};
2347 return {MVT::v32i8, 2};
2348 }
2349
2350 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2351 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2352 NumElts > 64)
2353 return {MVT::i8, NumElts};
2354
2355 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2356}
2357
2358MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2359 CallingConv::ID CC,
2360 EVT VT) const {
2361 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2362 Subtarget.hasAVX512()) {
2363 unsigned NumElts = VT.getVectorNumElements();
2364
2365 MVT RegisterVT;
2366 unsigned NumRegisters;
2367 std::tie(RegisterVT, NumRegisters) =
2368 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2369 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2370 return RegisterVT;
2371 }
2372
2373 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16.
2374 // So its default register type is f16. We override the type to v8f16 here.
2375 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2376 return MVT::v8f16;
2377
2378 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2379 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2380 !Subtarget.hasX87())
2381 return MVT::i32;
2382
2383 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2384}
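A hedged illustration of the mask mapping as a hypothetical check (not tree code; assumes the usual LLVM headers, <cassert>, an AVX512 subtarget, and the default C calling convention):

// Small i1 mask vectors travel in xmm/ymm registers under the C calling
// convention, per handleMaskRegisterForCallingConv above.
void checkMaskRegisterTypes(const X86TargetLowering &TLI, LLVMContext &Ctx) {
  assert(TLI.getRegisterTypeForCallingConv(Ctx, CallingConv::C, MVT::v8i1) ==
         MVT::v8i16);  // one xmm
  assert(TLI.getRegisterTypeForCallingConv(Ctx, CallingConv::C, MVT::v32i1) ==
         MVT::v32i8);  // one ymm
}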
2385
2386unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2387 CallingConv::ID CC,
2388 EVT VT) const {
2389 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2390 Subtarget.hasAVX512()) {
2391 unsigned NumElts = VT.getVectorNumElements();
2392
2393 MVT RegisterVT;
2394 unsigned NumRegisters;
2395 std::tie(RegisterVT, NumRegisters) =
2396 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2397 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2398 return NumRegisters;
2399 }
2400
2401 // v3f16 will be widened to v4f16, but we don't assign a register class for v4f16.
2402 // So its default register number is 3. We override the number to 1 here.
2403 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2404 return 1;
2405
2406 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2407 // x87 is disabled.
2408 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2409 if (VT == MVT::f64)
2410 return 2;
2411 if (VT == MVT::f80)
2412 return 3;
2413 }
2414
2415 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2416}
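And a matching hedged sketch of the no-x87, 32-bit path (hypothetical check, not tree code; Ctx and CC stand for any context and calling convention, and the usual LLVM headers plus <cassert> are assumed):

// With x87 disabled on 32-bit, f64 is carried in two i32 registers and
// f80 in three, per the code above.
void checkSoftX87RegCounts(const X86TargetLowering &TLI, LLVMContext &Ctx,
                           CallingConv::ID CC) {
  assert(TLI.getNumRegistersForCallingConv(Ctx, CC, MVT::f64) == 2);
  assert(TLI.getNumRegistersForCallingConv(Ctx, CC, MVT::f80) == 3);
}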
2417
2418unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2419 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2420 unsigned &NumIntermediates, MVT &RegisterVT) const {
2421 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2422 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2423 Subtarget.hasAVX512() &&
2424 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2425 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2426 VT.getVectorNumElements() > 64)) {
2427 RegisterVT = MVT::i8;
2428 IntermediateVT = MVT::i1;
2429 NumIntermediates = VT.getVectorNumElements();
2430 return NumIntermediates;
2431 }
2432
2433 // Split v64i1 vectors if we don't have v64i8 available.
2434 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2435 CC != CallingConv::X86_RegCall) {
2436 RegisterVT = MVT::v32i8;
2437 IntermediateVT = MVT::v32i1;
2438 NumIntermediates = 2;
2439 return 2;
2440 }
2441
2442 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2443 NumIntermediates, RegisterVT);
2444}
2445
2446EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2447 LLVMContext& Context,
2448 EVT VT) const {
2449 if (!VT.isVector())
2450 return MVT::i8;
2451
2452 if (Subtarget.hasAVX512()) {
2453 // Figure out what this type will be legalized to.
2454 EVT LegalVT = VT;
2455 while (getTypeAction(Context, LegalVT) != TypeLegal)
2456 LegalVT = getTypeToTransformTo(Context, LegalVT);
2457
2458 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2459 if (LegalVT.getSimpleVT().is512BitVector())
2460 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2461
2462 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2463 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2464 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2465 // vXi16/vXi8.
2466 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2467 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2468 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2469 }
2470 }
2471
2472 return VT.changeVectorElementTypeToInteger();
2473}
2474
2475/// Helper for getByValTypeAlignment to determine
2476/// the desired ByVal argument alignment.
2477static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2478 if (MaxAlign == 16)
2479 return;
2480 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2481 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2482 MaxAlign = Align(16);
2483 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2484 Align EltAlign;
2485 getMaxByValAlign(ATy->getElementType(), EltAlign);
2486 if (EltAlign > MaxAlign)
2487 MaxAlign = EltAlign;
2488 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2489 for (auto *EltTy : STy->elements()) {
2490 Align EltAlign;
2491 getMaxByValAlign(EltTy, EltAlign);
2492 if (EltAlign > MaxAlign)
2493 MaxAlign = EltAlign;
2494 if (MaxAlign == 16)
2495 break;
2496 }
2497 }
2498}
2499
2500/// Return the desired alignment for ByVal aggregate
2501/// function arguments in the caller parameter area. For X86, aggregates
2502/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2503/// are at 4-byte boundaries.
2504uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2505 const DataLayout &DL) const {
2506 if (Subtarget.is64Bit()) {
2507 // Max of 8 and alignment of type.
2508 Align TyAlign = DL.getABITypeAlign(Ty);
2509 if (TyAlign > 8)
2510 return TyAlign.value();
2511 return 8;
2512 }
2513
2514 Align Alignment(4);
2515 if (Subtarget.hasSSE1())
2516 getMaxByValAlign(Ty, Alignment);
2517 return Alignment.value();
2518}
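The rule above reduces to a small function; a minimal sketch, assuming the flags and TypeAbiAlign are stand-ins for DL.getABITypeAlign(Ty) and for what getMaxByValAlign discovers recursively:

#include <algorithm>
#include <cstdint>

static uint64_t byValAlignSketch(bool Is64Bit, uint64_t TypeAbiAlign,
                                 bool HasSSE1, bool Contains128BitVector) {
  if (Is64Bit)
    return std::max<uint64_t>(TypeAbiAlign, 8); // at least 8 bytes on x86-64
  if (HasSSE1 && Contains128BitVector)
    return 16;  // aggregates containing an SSE vector: 16-byte boundary
  return 4;     // everything else on 32-bit: 4-byte boundary
}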
2519
2520/// It returns EVT::Other if the type should be determined using generic
2521/// target-independent logic.
2522/// For vector ops we check that the overall size isn't larger than our
2523/// preferred vector width.
2524EVT X86TargetLowering::getOptimalMemOpType(
2525 const MemOp &Op, const AttributeList &FuncAttributes) const {
2526 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2527 if (Op.size() >= 16 &&
2528 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2529 // FIXME: Check if unaligned 64-byte accesses are slow.
2530 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2531 (Subtarget.getPreferVectorWidth() >= 512)) {
2532 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2533 }
2534 // FIXME: Check if unaligned 32-byte accesses are slow.
2535 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2536 (Subtarget.getPreferVectorWidth() >= 256)) {
2537 // Although this isn't a well-supported type for AVX1, we'll let
2538 // legalization and shuffle lowering produce the optimal codegen. If we
2539 // choose an optimal type with a vector element larger than a byte,
2540 // getMemsetStores() may create an intermediate splat (using an integer
2541 // multiply) before we splat as a vector.
2542 return MVT::v32i8;
2543 }
2544 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2545 return MVT::v16i8;
2546 // TODO: Can SSE1 handle a byte vector?
2547 // If we have SSE1 registers we should be able to use them.
2548 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2549 (Subtarget.getPreferVectorWidth() >= 128))
2550 return MVT::v4f32;
2551 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2552 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2553 // Do not use f64 to lower memcpy if source is string constant. It's
2554 // better to use i32 to avoid the loads.
2555 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2556 // The gymnastics of splatting a byte value into an XMM register and then
2557 // only using 8-byte stores (because this is a CPU with slow unaligned
2558 // 16-byte accesses) makes that a loser.
2559 return MVT::f64;
2560 }
2561 }
2562 // This is a compromise. If we reach here, unaligned accesses may be slow on
2563 // this target. However, creating smaller, aligned accesses could be even
2564 // slower and would certainly be a lot more code.
2565 if (Subtarget.is64Bit() && Op.size() >= 8)
2566 return MVT::i64;
2567 return MVT::i32;
2568}
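A worked example of the selection above, assuming unaligned 16-byte access is fast (or the destination is 16-byte aligned) and the function is not marked NoImplicitFloat: for a 64-byte zero memset the routine picks
  v64i8  with AVX-512BW and a preferred vector width of at least 512 (v16i32 without BWI),
  v32i8  with AVX and a preferred width of at least 256,
  v16i8  with SSE2,
  and otherwise i64 stores on 64-bit targets or i32 stores elsewhere.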
2569
2570bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2571 if (VT == MVT::f32)
2572 return Subtarget.hasSSE1();
2573 if (VT == MVT::f64)
2574 return Subtarget.hasSSE2();
2575 return true;
2576}
2577
2578bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2579 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2580 bool *Fast) const {
2581 if (Fast) {
2582 switch (VT.getSizeInBits()) {
2583 default:
2584 // 8-byte and under are always assumed to be fast.
2585 *Fast = true;
2586 break;
2587 case 128:
2588 *Fast = !Subtarget.isUnalignedMem16Slow();
2589 break;
2590 case 256:
2591 *Fast = !Subtarget.isUnalignedMem32Slow();
2592 break;
2593 // TODO: What about AVX-512 (512-bit) accesses?
2594 }
2595 }
2596 // NonTemporal vector memory ops must be aligned.
2597 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2598 // NT loads can only be vector aligned, so if it's less aligned than the
2599 // minimum vector size (which we can split the vector down to), we might as
2600 // well use a regular unaligned vector load.
2601 // We don't have any NT loads pre-SSE41.
2602 if (!!(Flags & MachineMemOperand::MOLoad))
2603 return (Alignment < 16 || !Subtarget.hasSSE41());
2604 return false;
2605 }
2606 // Misaligned accesses of any size are always allowed.
2607 return true;
2608}
2609
2610/// Return the entry encoding for a jump table in the
2611/// current function. The returned value is a member of the
2612/// MachineJumpTableInfo::JTEntryKind enum.
2613unsigned X86TargetLowering::getJumpTableEncoding() const {
2614 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2615 // symbol.
2616 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2617 return MachineJumpTableInfo::EK_Custom32;
2618
2619 // Otherwise, use the normal jump table encoding heuristics.
2620 return TargetLowering::getJumpTableEncoding();
2621}
2622
2623bool X86TargetLowering::useSoftFloat() const {
2624 return Subtarget.useSoftFloat();
2625}
2626
2627void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2628 ArgListTy &Args) const {
2629
2630 // Only relabel X86-32 for C / Stdcall CCs.
2631 if (Subtarget.is64Bit())
2632 return;
2633 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2634 return;
2635 unsigned ParamRegs = 0;
2636 if (auto *M = MF->getFunction().getParent())
2637 ParamRegs = M->getNumberRegisterParameters();
2638
2639 // Mark the first N int arguments as being passed in registers.
2640 for (auto &Arg : Args) {
2641 Type *T = Arg.Ty;
2642 if (T->isIntOrPtrTy())
2643 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2644 unsigned numRegs = 1;
2645 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2646 numRegs = 2;
2647 if (ParamRegs < numRegs)
2648 return;
2649 ParamRegs -= numRegs;
2650 Arg.IsInReg = true;
2651 }
2652 }
2653}
2654
2655const MCExpr *
2656X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2657 const MachineBasicBlock *MBB,
2658 unsigned uid,MCContext &Ctx) const{
2659 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2660 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2661 // entries.
2662 return MCSymbolRefExpr::create(MBB->getSymbol(),
2663 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2664}
2665
2666/// Returns relocation base for the given PIC jumptable.
2667SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2668 SelectionDAG &DAG) const {
2669 if (!Subtarget.is64Bit())
2670 // This doesn't have SDLoc associated with it, but is not really the
2671 // same as a Register.
2672 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2673 getPointerTy(DAG.getDataLayout()));
2674 return Table;
2675}
2676
2677/// This returns the relocation base for the given PIC jumptable,
2678/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2679const MCExpr *X86TargetLowering::
2680getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2681 MCContext &Ctx) const {
2682 // X86-64 uses RIP relative addressing based on the jump table label.
2683 if (Subtarget.isPICStyleRIPRel())
2684 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2685
2686 // Otherwise, the reference is relative to the PIC base.
2687 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2688}
2689
2690std::pair<const TargetRegisterClass *, uint8_t>
2691X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2692 MVT VT) const {
2693 const TargetRegisterClass *RRC = nullptr;
2694 uint8_t Cost = 1;
2695 switch (VT.SimpleTy) {
2696 default:
2697 return TargetLowering::findRepresentativeClass(TRI, VT);
2698 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2699 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2700 break;
2701 case MVT::x86mmx:
2702 RRC = &X86::VR64RegClass;
2703 break;
2704 case MVT::f32: case MVT::f64:
2705 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2706 case MVT::v4f32: case MVT::v2f64:
2707 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2708 case MVT::v8f32: case MVT::v4f64:
2709 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2710 case MVT::v16f32: case MVT::v8f64:
2711 RRC = &X86::VR128XRegClass;
2712 break;
2713 }
2714 return std::make_pair(RRC, Cost);
2715}
2716
2717unsigned X86TargetLowering::getAddressSpace() const {
2718 if (Subtarget.is64Bit())
2719 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2720 return 256;
2721}
2722
2723static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2724 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2725 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2726}
2727
2728static Constant* SegmentOffset(IRBuilderBase &IRB,
2729 int Offset, unsigned AddressSpace) {
2730 return ConstantExpr::getIntToPtr(
2731 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2732 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2733}
2734
2735Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2736 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2737 // tcbhead_t; use it instead of the usual global variable (see
2738 // sysdeps/{i386,x86_64}/nptl/tls.h)
2739 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2740 if (Subtarget.isTargetFuchsia()) {
2741 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2742 return SegmentOffset(IRB, 0x10, getAddressSpace());
2743 } else {
2744 unsigned AddressSpace = getAddressSpace();
2745 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2746 // In particular, some users may customize the base reg and offset.
2747 int Offset = M->getStackProtectorGuardOffset();
2748 // If we don't set -stack-protector-guard-offset value:
2749 // %fs:0x28, unless we're using a Kernel code model, in which case
2750 // it's %gs:0x28. gs:0x14 on i386.
2751 if (Offset == INT_MAX)
2752 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2753
2754 StringRef GuardReg = M->getStackProtectorGuardReg();
2755 if (GuardReg == "fs")
2756 AddressSpace = X86AS::FS;
2757 else if (GuardReg == "gs")
2758 AddressSpace = X86AS::GS;
2759 return SegmentOffset(IRB, Offset, AddressSpace);
2760 }
2761 }
2762 return TargetLowering::getIRStackGuard(IRB);
2763}
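A minimal sketch of the default slot chosen above when no -stack-protector-guard-* override is present (the helper and flags are hypothetical; address space 256 selects %gs and 257 selects %fs in this backend, which is what getAddressSpace() encodes):

struct GuardLocSketch { const char *Segment; int Offset; };

static GuardLocSketch defaultGuardSlotSketch(bool Is64Bit, bool IsFuchsia,
                                             bool KernelCodeModel) {
  if (IsFuchsia)
    return {"fs", 0x10};                         // ZX_TLS_STACK_GUARD_OFFSET
  if (!Is64Bit)
    return {"gs", 0x14};                         // i386 glibc/bionic slot
  return {KernelCodeModel ? "gs" : "fs", 0x28};  // x86-64 tcbhead_t slot
}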
2764
2765void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2766 // MSVC CRT provides functionalities for stack protection.
2767 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2768 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2769 // MSVC CRT has a global variable holding security cookie.
2770 M.getOrInsertGlobal("__security_cookie",
2771 Type::getInt8PtrTy(M.getContext()));
2772
2773 // MSVC CRT has a function to validate security cookie.
2774 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2775 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2776 Type::getInt8PtrTy(M.getContext()));
2777 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2778 F->setCallingConv(CallingConv::X86_FastCall);
2779 F->addParamAttr(0, Attribute::AttrKind::InReg);
2780 }
2781 return;
2782 }
2783
2784 StringRef GuardMode = M.getStackProtectorGuard();
2785
2786 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2787 if ((GuardMode == "tls" || GuardMode.empty()) &&
2788 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2789 return;
2790 TargetLowering::insertSSPDeclarations(M);
2791}
2792
2793Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2794 // MSVC CRT has a global variable holding security cookie.
2795 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2796 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2797 return M.getGlobalVariable("__security_cookie");
2798 }
2799 return TargetLowering::getSDagStackGuard(M);
2800}
2801
2802Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2803 // MSVC CRT has a function to validate security cookie.
2804 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2805 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2806 return M.getFunction("__security_check_cookie");
2807 }
2808 return TargetLowering::getSSPStackGuardCheck(M);
2809}
2810
2811Value *
2812X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2813 if (Subtarget.getTargetTriple().isOSContiki())
2814 return getDefaultSafeStackPointerLocation(IRB, false);
2815
2816 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2817 // definition of TLS_SLOT_SAFESTACK in
2818 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2819 if (Subtarget.isTargetAndroid()) {
2820 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2821 // %gs:0x24 on i386
2822 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2823 return SegmentOffset(IRB, Offset, getAddressSpace());
2824 }
2825
2826 // Fuchsia is similar.
2827 if (Subtarget.isTargetFuchsia()) {
2828 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2829 return SegmentOffset(IRB, 0x18, getAddressSpace());
2830 }
2831
2832 return TargetLowering::getSafeStackPointerLocation(IRB);
2833}
2834
2835//===----------------------------------------------------------------------===//
2836// Return Value Calling Convention Implementation
2837//===----------------------------------------------------------------------===//
2838
2839bool X86TargetLowering::CanLowerReturn(
2840 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2841 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2842 SmallVector<CCValAssign, 16> RVLocs;
2843 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2844 return CCInfo.CheckReturn(Outs, RetCC_X86);
2845}
2846
2847const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2848 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2849 return ScratchRegs;
2850}
2851
2852 /// Lowers mask values (v*i1) to the local register values
2853/// \returns DAG node after lowering to register type
2854static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2855 const SDLoc &Dl, SelectionDAG &DAG) {
2856 EVT ValVT = ValArg.getValueType();
2857
2858 if (ValVT == MVT::v1i1)
2859 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2860 DAG.getIntPtrConstant(0, Dl));
2861
2862 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2863 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2864 // Two stage lowering might be required
2865 // bitcast: v8i1 -> i8 / v16i1 -> i16
2866 // anyextend: i8 -> i32 / i16 -> i32
2867 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2868 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2869 if (ValLoc == MVT::i32)
2870 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2871 return ValToCopy;
2872 }
2873
2874 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2875 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2876 // One stage lowering is required
2877 // bitcast: v32i1 -> i32 / v64i1 -> i64
2878 return DAG.getBitcast(ValLoc, ValArg);
2879 }
2880
2881 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2882}
2883
2884 /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG
2885static void Passv64i1ArgInRegs(
2886 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2887 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2888 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2889 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2890 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2891 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2892 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2893        "The value should reside in two registers");
2894
2895 // Before splitting the value we cast it to i64
2896 Arg = DAG.getBitcast(MVT::i64, Arg);
2897
2898 // Splitting the value into two i32 types
2899 SDValue Lo, Hi;
2900 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2901 DAG.getConstant(0, Dl, MVT::i32));
2902 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2903 DAG.getConstant(1, Dl, MVT::i32));
2904
2905 // Attach the two i32 types into corresponding registers
2906 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2907 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2908}
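A scalar-level illustration of the same split (a standalone sketch, not DAG code): the v64i1 is bitcast to i64, the low 32 bits go to the first register and the high 32 bits to the next one, and getv64i1Argument further down reassembles them in the same order.

#include <cstdint>

static void splitMask64(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Mask);        // EXTRACT_ELEMENT index 0
  Hi = static_cast<uint32_t>(Mask >> 32);  // EXTRACT_ELEMENT index 1
}

static uint64_t joinMask64(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // CONCAT_VECTORS of two v32i1
}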
2909
2910SDValue
2911X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2912 bool isVarArg,
2913 const SmallVectorImpl<ISD::OutputArg> &Outs,
2914 const SmallVectorImpl<SDValue> &OutVals,
2915 const SDLoc &dl, SelectionDAG &DAG) const {
2916 MachineFunction &MF = DAG.getMachineFunction();
2917 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2918
2919 // In some cases we need to disable registers from the default CSR list.
2920 // For example, when they are used for argument passing.
2921 bool ShouldDisableCalleeSavedRegister =
2922 CallConv == CallingConv::X86_RegCall ||
2923 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2924
2925 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2926 report_fatal_error("X86 interrupts may not return any value");
2927
2928 SmallVector<CCValAssign, 16> RVLocs;
2929 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2930 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2931
2932 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2933 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2934 ++I, ++OutsIndex) {
2935 CCValAssign &VA = RVLocs[I];
2936 assert(VA.isRegLoc() && "Can only return in registers!");
2937
2938 // Add the register to the CalleeSaveDisableRegs list.
2939 if (ShouldDisableCalleeSavedRegister)
2940 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2941
2942 SDValue ValToCopy = OutVals[OutsIndex];
2943 EVT ValVT = ValToCopy.getValueType();
2944
2945 // Promote values to the appropriate types.
2946 if (VA.getLocInfo() == CCValAssign::SExt)
2947 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2948 else if (VA.getLocInfo() == CCValAssign::ZExt)
2949 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2950 else if (VA.getLocInfo() == CCValAssign::AExt) {
2951 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2952 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2953 else
2954 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2955 }
2956 else if (VA.getLocInfo() == CCValAssign::BCvt)
2957 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2958
2959 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2960        "Unexpected FP-extend for return value.");
2961
2962 // Report an error if we have attempted to return a value via an XMM
2963 // register and SSE was disabled.
2964 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2965 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2966 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2967 } else if (!Subtarget.hasSSE2() &&
2968 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2969 ValVT == MVT::f64) {
2970 // When returning a double via an XMM register, report an error if SSE2 is
2971 // not enabled.
2972 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2973 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2974 }
2975
2976 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2977 // the RET instruction and handled by the FP Stackifier.
2978 if (VA.getLocReg() == X86::FP0 ||
2979 VA.getLocReg() == X86::FP1) {
2980 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2981 // change the value to the FP stack register class.
2982 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2983 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2984 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2985 // Don't emit a copytoreg.
2986 continue;
2987 }
2988
2989 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2990 // which is returned in RAX / RDX.
2991 if (Subtarget.is64Bit()) {
2992 if (ValVT == MVT::x86mmx) {
2993 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2994 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2995 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2996 ValToCopy);
2997 // If we don't have SSE2 available, convert to v4f32 so the generated
2998 // register is legal.
2999 if (!Subtarget.hasSSE2())
3000 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3001 }
3002 }
3003 }
3004
3005 if (VA.needsCustom()) {
3006 assert(VA.getValVT() == MVT::v64i1 &&
3007        "Currently the only custom case is when we split v64i1 to 2 regs");
3008
3009 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3010 Subtarget);
3011
3012 // Add the second register to the CalleeSaveDisableRegs list.
3013 if (ShouldDisableCalleeSavedRegister)
3014 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3015 } else {
3016 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3017 }
3018 }
3019
3020 SDValue Flag;
3021 SmallVector<SDValue, 6> RetOps;
3022 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3023 // Operand #1 = Bytes To Pop
3024 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3025 MVT::i32));
3026
3027 // Copy the result values into the output registers.
3028 for (auto &RetVal : RetVals) {
3029 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3030 RetOps.push_back(RetVal.second);
3031 continue; // Don't emit a copytoreg.
3032 }
3033
3034 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
3035 Flag = Chain.getValue(1);
3036 RetOps.push_back(
3037 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3038 }
3039
3040 // Swift calling convention does not require we copy the sret argument
3041 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3042
3043 // All x86 ABIs require that for returning structs by value we copy
3044 // the sret argument into %rax/%eax (depending on ABI) for the return.
3045 // We saved the argument into a virtual register in the entry block,
3046 // so now we copy the value out and into %rax/%eax.
3047 //
3048 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3049 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3050 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3051 // either case FuncInfo->setSRetReturnReg() will have been called.
3052 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3053 // When we have both sret and another return value, we should use the
3054 // original Chain stored in RetOps[0], instead of the current Chain updated
3055 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3056
3057 // For the case of sret and another return value, we have
3058 // Chain_0 at the function entry
3059 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3060 // If we use Chain_1 in getCopyFromReg, we will have
3061 // Val = getCopyFromReg(Chain_1)
3062 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3063
3064 // getCopyToReg(Chain_0) will be glued together with
3065 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3066 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3067 // Data dependency from Unit B to Unit A due to usage of Val in
3068 // getCopyToReg(Chain_1, Val)
3069 // Chain dependency from Unit A to Unit B
3070
3071 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3072 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3073 getPointerTy(MF.getDataLayout()));
3074
3075 Register RetValReg
3076 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3077 X86::RAX : X86::EAX;
3078 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3079 Flag = Chain.getValue(1);
3080
3081 // RAX/EAX now acts like a return value.
3082 RetOps.push_back(
3083 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3084
3085 // Add the returned register to the CalleeSaveDisableRegs list.
3086 if (ShouldDisableCalleeSavedRegister)
3087 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3088 }
3089
3090 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3091 const MCPhysReg *I =
3092 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3093 if (I) {
3094 for (; *I; ++I) {
3095 if (X86::GR64RegClass.contains(*I))
3096 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3097 else
3098 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3099 }
3100 }
3101
3102 RetOps[0] = Chain; // Update chain.
3103
3104 // Add the flag if we have it.
3105 if (Flag.getNode())
3106 RetOps.push_back(Flag);
3107
3108 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3109 if (CallConv == CallingConv::X86_INTR)
3110 opcode = X86ISD::IRET;
3111 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3112}
3113
3114bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3115 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3116 return false;
3117
3118 SDValue TCChain = Chain;
3119 SDNode *Copy = *N->use_begin();
3120 if (Copy->getOpcode() == ISD::CopyToReg) {
3121 // If the copy has a glue operand, we conservatively assume it isn't safe to
3122 // perform a tail call.
3123 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3124 return false;
3125 TCChain = Copy->getOperand(0);
3126 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3127 return false;
3128
3129 bool HasRet = false;
3130 for (const SDNode *U : Copy->uses()) {
3131 if (U->getOpcode() != X86ISD::RET_FLAG)
3132 return false;
3133 // If we are returning more than one value, we can definitely
3134 // not make a tail call; see PR19530.
3135 if (U->getNumOperands() > 4)
3136 return false;
3137 if (U->getNumOperands() == 4 &&
3138 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3139 return false;
3140 HasRet = true;
3141 }
3142
3143 if (!HasRet)
3144 return false;
3145
3146 Chain = TCChain;
3147 return true;
3148}
3149
3150EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3151 ISD::NodeType ExtendKind) const {
3152 MVT ReturnMVT = MVT::i32;
3153
3154 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3155 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3156 // The ABI does not require i1, i8 or i16 to be extended.
3157 //
3158 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3159 // always extending i8/i16 return values, so keep doing that for now.
3160 // (PR26665).
3161 ReturnMVT = MVT::i8;
3162 }
3163
3164 EVT MinVT = getRegisterType(Context, ReturnMVT);
3165 return VT.bitsLT(MinVT) ? MinVT : VT;
3166}
3167
3168/// Reads two 32 bit registers and creates a 64 bit mask value.
3169 /// \param VA The current 32 bit value that needs to be assigned.
3170 /// \param NextVA The next 32 bit value that needs to be assigned.
3171/// \param Root The parent DAG node.
3172 /// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
3173 ///        glue purposes. If the DAG is already using a
3174 ///        physical register instead of a virtual one, we should glue
3175 ///        our new SDValue to the InFlag SDValue.
3176 /// \return a new SDValue of size 64 bits.
3177static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3178 SDValue &Root, SelectionDAG &DAG,
3179 const SDLoc &Dl, const X86Subtarget &Subtarget,
3180 SDValue *InFlag = nullptr) {
3181 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3182 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3183 assert(VA.getValVT() == MVT::v64i1 &&
3184        "Expecting first location of 64 bit width type");
3185 assert(NextVA.getValVT() == VA.getValVT() &&
3186        "The locations should have the same type");
3187 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3188        "The values should reside in two registers");
3189
3190 SDValue Lo, Hi;
3191 SDValue ArgValueLo, ArgValueHi;
3192
3193 MachineFunction &MF = DAG.getMachineFunction();
3194 const TargetRegisterClass *RC = &X86::GR32RegClass;
3195
3196 // Read a 32 bit value from the registers.
3197 if (nullptr == InFlag) {
3198 // When no physical register is present,
3199 // create an intermediate virtual register.
3200 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3201 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3202 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3203 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3204 } else {
3205 // When a physical register is available read the value from it and glue
3206 // the reads together.
3207 ArgValueLo =
3208 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3209 *InFlag = ArgValueLo.getValue(2);
3210 ArgValueHi =
3211 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3212 *InFlag = ArgValueHi.getValue(2);
3213 }
3214
3215 // Convert the i32 type into v32i1 type.
3216 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3217
3218 // Convert the i32 type into v32i1 type.
3219 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3220
3221 // Concatenate the two values together.
3222 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3223}
3224
3225/// The function will lower a register of various sizes (8/16/32/64)
3226/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3227 /// \returns a DAG node containing the operand after lowering to the mask type.
3228static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3229 const EVT &ValLoc, const SDLoc &Dl,
3230 SelectionDAG &DAG) {
3231 SDValue ValReturned = ValArg;
3232
3233 if (ValVT == MVT::v1i1)
3234 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3235
3236 if (ValVT == MVT::v64i1) {
3237 // On a 32 bit machine, this case is handled by getv64i1Argument
3238 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3239 // On a 64 bit machine, there is no need to truncate the value, only bitcast it.
3240 } else {
3241 MVT maskLen;
3242 switch (ValVT.getSimpleVT().SimpleTy) {
3243 case MVT::v8i1:
3244 maskLen = MVT::i8;
3245 break;
3246 case MVT::v16i1:
3247 maskLen = MVT::i16;
3248 break;
3249 case MVT::v32i1:
3250 maskLen = MVT::i32;
3251 break;
3252 default:
3253 llvm_unreachable("Expecting a vector of i1 types");
3254 }
3255
3256 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3257 }
3258 return DAG.getBitcast(ValVT, ValReturned);
3259}
3260
3261/// Lower the result values of a call into the
3262/// appropriate copies out of appropriate physical registers.
3263///
3264SDValue X86TargetLowering::LowerCallResult(
3265 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3266 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3267 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3268 uint32_t *RegMask) const {
3269
3270 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3271 // Assign locations to each value returned by this call.
3272 SmallVector<CCValAssign, 16> RVLocs;
3273 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3274 *DAG.getContext());
3275 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3276
3277 // Copy all of the result registers out of their specified physreg.
3278 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3279 ++I, ++InsIndex) {
3280 CCValAssign &VA = RVLocs[I];
3281 EVT CopyVT = VA.getLocVT();
3282
3283 // In some calling conventions we need to remove the used registers
3284 // from the register mask.
3285 if (RegMask) {
3286 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3287 SubRegs.isValid(); ++SubRegs)
3288 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3289 }
3290
3291 // Report an error if there was an attempt to return FP values via XMM
3292 // registers.
3293 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3294 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3295 if (VA.getLocReg() == X86::XMM1)
3296 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3297 else
3298 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3299 } else if (!Subtarget.hasSSE2() &&
3300 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3301 CopyVT == MVT::f64) {
3302 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3303 if (VA.getLocReg() == X86::XMM1)
3304 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3305 else
3306 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3307 }
3308
3309 // If we prefer to use the value in xmm registers, copy it out as f80 and
3310 // use a truncate to move it from fp stack reg to xmm reg.
3311 bool RoundAfterCopy = false;
3312 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3313 isScalarFPTypeInSSEReg(VA.getValVT())) {
3314 if (!Subtarget.hasX87())
3315 report_fatal_error("X87 register return with X87 disabled");
3316 CopyVT = MVT::f80;
3317 RoundAfterCopy = (CopyVT != VA.getLocVT());
3318 }
3319
3320 SDValue Val;
3321 if (VA.needsCustom()) {
3322 assert(VA.getValVT() == MVT::v64i1 &&
3323        "Currently the only custom case is when we split v64i1 to 2 regs");
3324 Val =
3325 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3326 } else {
3327 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3328 .getValue(1);
3329 Val = Chain.getValue(0);
3330 InFlag = Chain.getValue(2);
3331 }
3332
3333 if (RoundAfterCopy)
3334 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3335 // This truncation won't change the value.
3336 DAG.getIntPtrConstant(1, dl));
3337
3338 if (VA.isExtInLoc()) {
3339 if (VA.getValVT().isVector() &&
3340 VA.getValVT().getScalarType() == MVT::i1 &&
3341 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3342 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3343 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3344 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3345 } else
3346 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3347 }
3348
3349 if (VA.getLocInfo() == CCValAssign::BCvt)
3350 Val = DAG.getBitcast(VA.getValVT(), Val);
3351
3352 InVals.push_back(Val);
3353 }
3354
3355 return Chain;
3356}
3357
3358//===----------------------------------------------------------------------===//
3359// C & StdCall & Fast Calling Convention implementation
3360//===----------------------------------------------------------------------===//
3361// StdCall calling convention seems to be standard for many Windows' API
3362// routines and around. It differs from C calling convention just a little:
3363// callee should clean up the stack, not caller. Symbols should be also
3364// decorated in some fancy way :) It doesn't support any vector arguments.
3365// For info on fast calling convention see Fast Calling Convention (tail call)
3366// implementation LowerX86_32FastCCCallTo.
3367
3368/// Determines whether Args, either a set of outgoing arguments to a call, or a
3369/// set of incoming args of a call, contains an sret pointer that the callee
3370/// pops
3371template <typename T>
3372static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3373 const X86Subtarget &Subtarget) {
3374 // Not C++20 (yet), so no concepts available.
3375 static_assert(std::is_same<T, ISD::OutputArg>::value ||
3376 std::is_same<T, ISD::InputArg>::value,
3377 "requires ISD::OutputArg or ISD::InputArg");
3378
3379 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3380 // for most compilations.
3381 if (!Subtarget.is32Bit())
3382 return false;
3383
3384 if (Args.empty())
3385 return false;
3386
3387 // Most calls do not have an sret argument, check the arg next.
3388 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3389 if (!Flags.isSRet() || Flags.isInReg())
3390 return false;
3391
3392 // The MSVC ABI does not pop the sret.
3393 if (Subtarget.getTargetTriple().isOSMSVCRT())
3394 return false;
3395
3396 // MCUs don't pop the sret
3397 if (Subtarget.isTargetMCU())
3398 return false;
3399
3400 // Callee pops argument
3401 return true;
3402}
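The case this predicate detects, as a minimal hypothetical source-level example: on 32-bit non-MSVC, non-MCU targets the callee pops the hidden sret pointer itself (a "ret $4"), so the caller must not also clean it up.

struct BigSketch { int V[4]; };
BigSketch makeBigSketch();       // returned indirectly via a hidden sret slot
void useBigSketch() {
  BigSketch B = makeBigSketch(); // caller passes &B; callee pops that pointer
  (void)B;
}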
3403
3404/// Make a copy of an aggregate at address specified by "Src" to address
3405/// "Dst" with size and alignment information specified by the specific
3406/// parameter attribute. The copy will be passed as a byval function parameter.
3407static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3408 SDValue Chain, ISD::ArgFlagsTy Flags,
3409 SelectionDAG &DAG, const SDLoc &dl) {
3410 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3411
3412 return DAG.getMemcpy(
3413 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3414 /*isVolatile*/ false, /*AlwaysInline=*/true,
3415 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3416}
3417
3418/// Return true if the calling convention is one that we can guarantee TCO for.
3419static bool canGuaranteeTCO(CallingConv::ID CC) {
3420 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3421 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3422 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3423 CC == CallingConv::SwiftTail);
3424}
3425
3426/// Return true if we might ever do TCO for calls with this calling convention.
3427static bool mayTailCallThisCC(CallingConv::ID CC) {
3428 switch (CC) {
3429 // C calling conventions:
3430 case CallingConv::C:
3431 case CallingConv::Win64:
3432 case CallingConv::X86_64_SysV:
3433 // Callee pop conventions:
3434 case CallingConv::X86_ThisCall:
3435 case CallingConv::X86_StdCall:
3436 case CallingConv::X86_VectorCall:
3437 case CallingConv::X86_FastCall:
3438 // Swift:
3439 case CallingConv::Swift:
3440 return true;
3441 default:
3442 return canGuaranteeTCO(CC);
3443 }
3444}
3445
3446/// Return true if the function is being made into a tailcall target by
3447/// changing its ABI.
3448static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3449 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3450 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3451}
3452
3453bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3454 if (!CI->isTailCall())
3455 return false;
3456
3457 CallingConv::ID CalleeCC = CI->getCallingConv();
3458 if (!mayTailCallThisCC(CalleeCC))
3459 return false;
3460
3461 return true;
3462}
3463
3464SDValue
3465X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3466 const SmallVectorImpl<ISD::InputArg> &Ins,
3467 const SDLoc &dl, SelectionDAG &DAG,
3468 const CCValAssign &VA,
3469 MachineFrameInfo &MFI, unsigned i) const {
3470 // Create the nodes corresponding to a load from this parameter slot.
3471 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3472 bool AlwaysUseMutable = shouldGuaranteeTCO(
3473 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3474 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3475 EVT ValVT;
3476 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3477
3478 // If the value is passed by pointer, we have the address passed instead of the value
3479 // itself. No need to extend if the mask value and location share the same
3480 // absolute size.
3481 bool ExtendedInMem =
3482 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3483 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3484
3485 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3486 ValVT = VA.getLocVT();
3487 else
3488 ValVT = VA.getValVT();
3489
3490 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3491 // changed with more analysis.
3492 // In case of tail call optimization, mark all arguments mutable, since they
3493 // could be overwritten by the lowering of arguments in case of a tail call.
3494 if (Flags.isByVal()) {
3495 unsigned Bytes = Flags.getByValSize();
3496 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3497
3498 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3499 // can be improved with deeper analysis.
3500 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3501 /*isAliased=*/true);
3502 return DAG.getFrameIndex(FI, PtrVT);
3503 }
3504
3505 EVT ArgVT = Ins[i].ArgVT;
3506
3507 // If this is a vector that has been split into multiple parts, and the
3508 // scalar size of the parts doesn't match the vector element size, then we can't
3509 // elide the copy. The parts will have padding between them instead of being
3510 // packed like a vector.
3511 bool ScalarizedAndExtendedVector =
3512 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3513 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3514
3515 // This is an argument in memory. We might be able to perform copy elision.
3516 // If the argument is passed directly in memory without any extension, then we
3517 // can perform copy elision. Large vector types, for example, may be passed
3518 // indirectly by pointer.
3519 if (Flags.isCopyElisionCandidate() &&
3520 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3521 !ScalarizedAndExtendedVector) {
3522 SDValue PartAddr;
3523 if (Ins[i].PartOffset == 0) {
3524 // If this is a one-part value or the first part of a multi-part value,
3525 // create a stack object for the entire argument value type and return a
3526 // load from our portion of it. This assumes that if the first part of an
3527 // argument is in memory, the rest will also be in memory.
3528 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3529 /*IsImmutable=*/false);
3530 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3531 return DAG.getLoad(
3532 ValVT, dl, Chain, PartAddr,
3533 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3534 } else {
3535 // This is not the first piece of an argument in memory. See if there is
3536 // already a fixed stack object including this offset. If so, assume it
3537 // was created by the PartOffset == 0 branch above and create a load from
3538 // the appropriate offset into it.
3539 int64_t PartBegin = VA.getLocMemOffset();
3540 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3541 int FI = MFI.getObjectIndexBegin();
3542 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3543 int64_t ObjBegin = MFI.getObjectOffset(FI);
3544 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3545 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3546 break;
3547 }
3548 if (MFI.isFixedObjectIndex(FI)) {
3549 SDValue Addr =
3550 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3551 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3552 return DAG.getLoad(
3553 ValVT, dl, Chain, Addr,
3554 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3555 Ins[i].PartOffset));
3556 }
3557 }
3558 }
3559
3560 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3561 VA.getLocMemOffset(), isImmutable);
3562
3563 // Set SExt or ZExt flag.
3564 if (VA.getLocInfo() == CCValAssign::ZExt) {
3565 MFI.setObjectZExt(FI, true);
3566 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3567 MFI.setObjectSExt(FI, true);
3568 }
3569
3570 MaybeAlign Alignment;
3571 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3572 ValVT != MVT::f80)
3573 Alignment = MaybeAlign(4);
3574 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3575 SDValue Val = DAG.getLoad(
3576 ValVT, dl, Chain, FIN,
3577 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3578 Alignment);
3579 return ExtendedInMem
3580 ? (VA.getValVT().isVector()
3581 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3582 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3583 : Val;
3584}
3585
3586// FIXME: Get this from tablegen.
3587static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3588 const X86Subtarget &Subtarget) {
3589 assert(Subtarget.is64Bit());
3590
3591 if (Subtarget.isCallingConvWin64(CallConv)) {
3592 static const MCPhysReg GPR64ArgRegsWin64[] = {
3593 X86::RCX, X86::RDX, X86::R8, X86::R9
3594 };
3595 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3596 }
3597
3598 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3599 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3600 };
3601 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3602}
3603
3604// FIXME: Get this from tablegen.
3605static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3606 CallingConv::ID CallConv,
3607 const X86Subtarget &Subtarget) {
3608 assert(Subtarget.is64Bit());
3609 if (Subtarget.isCallingConvWin64(CallConv)) {
3610 // The XMM registers which might contain var arg parameters are shadowed
3611 // in their paired GPR. So we only need to save the GPR to their home
3612 // slots.
3613 // TODO: __vectorcall will change this.
3614 return None;
3615 }
3616
3617 bool isSoftFloat = Subtarget.useSoftFloat();
3618 if (isSoftFloat || !Subtarget.hasSSE1())
3619 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3620 // registers.
3621 return None;
3622
3623 static const MCPhysReg XMMArgRegs64Bit[] = {
3624 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3625 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3626 };
3627 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3628}
3629
3630#ifndef NDEBUG
3631static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3632 return llvm::is_sorted(
3633 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3634 return A.getValNo() < B.getValNo();
3635 });
3636}
3637#endif
3638
3639namespace {
3640/// This is a helper class for lowering variable arguments parameters.
3641class VarArgsLoweringHelper {
3642public:
3643 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3644 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3645 CallingConv::ID CallConv, CCState &CCInfo)
3646 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3647 TheMachineFunction(DAG.getMachineFunction()),
3648 TheFunction(TheMachineFunction.getFunction()),
3649 FrameInfo(TheMachineFunction.getFrameInfo()),
3650 FrameLowering(*Subtarget.getFrameLowering()),
3651 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3652 CCInfo(CCInfo) {}
3653
3654 // Lower variable arguments parameters.
3655 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3656
3657private:
3658 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3659
3660 void forwardMustTailParameters(SDValue &Chain);
3661
3662 bool is64Bit() const { return Subtarget.is64Bit(); }
3663 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3664
3665 X86MachineFunctionInfo *FuncInfo;
3666 const SDLoc &DL;
3667 SelectionDAG &DAG;
3668 const X86Subtarget &Subtarget;
3669 MachineFunction &TheMachineFunction;
3670 const Function &TheFunction;
3671 MachineFrameInfo &FrameInfo;
3672 const TargetFrameLowering &FrameLowering;
3673 const TargetLowering &TargLowering;
3674 CallingConv::ID CallConv;
3675 CCState &CCInfo;
3676};
3677} // namespace
3678
3679void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3680 SDValue &Chain, unsigned StackSize) {
3681 // If the function takes a variable number of arguments, make a frame index for
3682 // the start of the first vararg value... for expansion of llvm.va_start. We
3683 // can skip this if there are no va_start calls.
3684 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3685 CallConv != CallingConv::X86_ThisCall)) {
3686 FuncInfo->setVarArgsFrameIndex(
3687 FrameInfo.CreateFixedObject(1, StackSize, true));
3688 }
3689
3690 // 64-bit calling conventions support varargs and register parameters, so we
3691 // have to do extra work to spill them in the prologue.
3692 if (is64Bit()) {
3693 // Find the first unallocated argument registers.
3694 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3695 ArrayRef<MCPhysReg> ArgXMMs =
3696 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3697 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3698 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3699
3700 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3701 "SSE register cannot be used when SSE is disabled!");
3702
3703 if (isWin64()) {
3704 // Get to the caller-allocated home save location. Add 8 to account
3705 // for the return address.
3706 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3707 FuncInfo->setRegSaveFrameIndex(
3708 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3709 // Fix up the vararg frame index to point at the shadow area (4 x i64).
3710 if (NumIntRegs < 4)
3711 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3712 } else {
3713 // For X86-64, if there are vararg parameters that are passed via
3714 // registers, then we must store them to their spots on the stack so
3715 // they may be loaded by dereferencing the result of va_next.
3716 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3717 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3718 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3719 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3720 }
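// For example, with the SysV register sets above (6 GPRs, 8 XMMs), the
// register save area is 6 * 8 + 8 * 16 = 176 bytes. If two integer and one
// FP argument were already allocated to registers, VarArgsGPOffset is 16 and
// VarArgsFPOffset is 48 + 16 = 64, the gp_offset/fp_offset values that
// va_start later stores into the va_list. On Win64, the caller-allocated
// 4 x 8-byte home area handled above plays the same role instead.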
3721
3722 SmallVector<SDValue, 6>
3723 LiveGPRs; // list of SDValue for GPR registers keeping live input value
3724 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
3725 // keeping live input value
3726 SDValue ALVal; // if applicable keeps SDValue for %al register
3727
3728 // Gather all the live in physical registers.
3729 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3730 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3731 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3732 }
3733 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3734 if (!AvailableXmms.empty()) {
3735 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3736 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3737 for (MCPhysReg Reg : AvailableXmms) {
3738 // FastRegisterAllocator spills virtual registers at basic
3739 // block boundaries. That leads to uses of XMM registers
3740 // outside of the check for %al. Pass physical registers to
3741 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3742 TheMachineFunction.getRegInfo().addLiveIn(Reg);
3743 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3744 }
3745 }
3746
3747 // Store the integer parameter registers.
3748 SmallVector<SDValue, 8> MemOps;
3749 SDValue RSFIN =
3750 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3751 TargLowering.getPointerTy(DAG.getDataLayout()));
3752 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3753 for (SDValue Val : LiveGPRs) {
3754 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3755 TargLowering.getPointerTy(DAG.getDataLayout()),
3756 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3757 SDValue Store =
3758 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3759 MachinePointerInfo::getFixedStack(
3760 DAG.getMachineFunction(),
3761 FuncInfo->getRegSaveFrameIndex(), Offset));
3762 MemOps.push_back(Store);
3763 Offset += 8;
3764 }
3765
3766 // Now store the XMM (fp + vector) parameter registers.
3767 if (!LiveXMMRegs.empty()) {
3768 SmallVector<SDValue, 12> SaveXMMOps;
3769 SaveXMMOps.push_back(Chain);
3770 SaveXMMOps.push_back(ALVal);
3771 SaveXMMOps.push_back(RSFIN);
3772 SaveXMMOps.push_back(
3773 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3774 llvm::append_range(SaveXMMOps, LiveXMMRegs);
3775 MachineMemOperand *StoreMMO =
3776 DAG.getMachineFunction().getMachineMemOperand(
3777 MachinePointerInfo::getFixedStack(
3778 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
3779 Offset),
3780 MachineMemOperand::MOStore, 128, Align(16));
3781 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
3782 DL, DAG.getVTList(MVT::Other),
3783 SaveXMMOps, MVT::i8, StoreMMO));
3784 }
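// A rough sketch of what the pseudo above expands to later (in its custom
// inserter): if %al is non-zero, the remaining XMM argument registers are
// stored into the FP part of the register save area at RSFIN + FPOffset;
// if %al is zero, the stores are skipped entirely.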
3785
3786 if (!MemOps.empty())
3787 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3788 }
3789}
3790
3791void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3792 // Find the largest legal vector type.
3793 MVT VecVT = MVT::Other;
3794 // FIXME: Only some x86_32 calling conventions support AVX512.
3795 if (Subtarget.useAVX512Regs() &&
3796 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3797 CallConv == CallingConv::Intel_OCL_BI)))
3798 VecVT = MVT::v16f32;
3799 else if (Subtarget.hasAVX())
3800 VecVT = MVT::v8f32;
3801 else if (Subtarget.hasSSE2())
3802 VecVT = MVT::v4f32;
3803
3804 // We forward some GPRs and some vector types.
3805 SmallVector<MVT, 2> RegParmTypes;
3806 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3807 RegParmTypes.push_back(IntVT);
3808 if (VecVT != MVT::Other)
3809 RegParmTypes.push_back(VecVT);
3810
3811 // Compute the set of forwarded registers. The rest are scratch.
3812 SmallVectorImpl<ForwardedRegister> &Forwards =
3813 FuncInfo->getForwardedMustTailRegParms();
3814 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3815
3816 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3817 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3818 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3819 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3820 }
3821
3822 // Copy all forwards from physical to virtual registers.
3823 for (ForwardedRegister &FR : Forwards) {
3824 // FIXME: Can we use a less constrained schedule?
3825 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3826 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3827 TargLowering.getRegClassFor(FR.VT));
3828 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3829 }
3830}
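// For example, in a SysV x86-64 vararg function compiled with AVX that
// contains a musttail call, the still-unallocated argument GPRs, the YMM
// argument registers (modelled as v8f32 here) and %al are each copied into
// fresh virtual registers above, so the later tail call can pass them
// through unchanged.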
3831
3832void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3833 unsigned StackSize) {
3834 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3835 // If necessary, it will be set to the correct value later.
3836 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3837 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3838
3839 if (FrameInfo.hasVAStart())
3840 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3841
3842 if (FrameInfo.hasMustTailInVarArgFunc())
3843 forwardMustTailParameters(Chain);
3844}
3845
3846SDValue X86TargetLowering::LowerFormalArguments(
3847 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3848 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3849 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3850 MachineFunction &MF = DAG.getMachineFunction();
3851 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3852
3853 const Function &F = MF.getFunction();
3854 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3855 F.getName() == "main")
3856 FuncInfo->setForceFramePointer(true);
3857
3858 MachineFrameInfo &MFI = MF.getFrameInfo();
3859 bool Is64Bit = Subtarget.is64Bit();
3860 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3861
3862 assert(
3863 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3864 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3865
3866 // Assign locations to all of the incoming arguments.
3867 SmallVector<CCValAssign, 16> ArgLocs;
3868 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3869
3870 // Allocate shadow area for Win64.
3871 if (IsWin64)
3872 CCInfo.AllocateStack(32, Align(8));
3873
3874 CCInfo.AnalyzeArguments(Ins, CC_X86);
3875
3876 // In vectorcall calling convention a second pass is required for the HVA
3877 // types.
3878 if (CallingConv::X86_VectorCall == CallConv) {
3879 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3880 }
3881
3882 // The next loop assumes that the locations are in the same order as the
3883 // input arguments.
3884 assert(isSortedByValueNo(ArgLocs) &&
3885 "Argument Location list must be sorted before lowering");
3886
3887 SDValue ArgValue;
3888 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3889 ++I, ++InsIndex) {
3890 assert(InsIndex < Ins.size() && "Invalid Ins index");
3891 CCValAssign &VA = ArgLocs[I];
3892
3893 if (VA.isRegLoc()) {
3894 EVT RegVT = VA.getLocVT();
3895 if (VA.needsCustom()) {
3896 assert(
3897 VA.getValVT() == MVT::v64i1 &&
3898 "Currently the only custom case is when we split v64i1 to 2 regs");
3899
3900 // v64i1 values, in regcall calling convention, that are
3901 // compiled to 32 bit arch, are split up into two registers.
3902 ArgValue =
3903 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3904 } else {
3905 const TargetRegisterClass *RC;
3906 if (RegVT == MVT::i8)
3907 RC = &X86::GR8RegClass;
3908 else if (RegVT == MVT::i16)
3909 RC = &X86::GR16RegClass;
3910 else if (RegVT == MVT::i32)
3911 RC = &X86::GR32RegClass;
3912 else if (Is64Bit && RegVT == MVT::i64)
3913 RC = &X86::GR64RegClass;
3914 else if (RegVT == MVT::f16)
3915 RC = &X86::FR16XRegClass;
3916 else if (RegVT == MVT::f32)
3917 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3918 else if (RegVT == MVT::f64)
3919 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3920 else if (RegVT == MVT::f80)
3921 RC = &X86::RFP80RegClass;
3922 else if (RegVT == MVT::f128)
3923 RC = &X86::VR128RegClass;
3924 else if (RegVT.is512BitVector())
3925 RC = &X86::VR512RegClass;
3926 else if (RegVT.is256BitVector())
3927 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3928 else if (RegVT.is128BitVector())
3929 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3930 else if (RegVT == MVT::x86mmx)
3931 RC = &X86::VR64RegClass;
3932 else if (RegVT == MVT::v1i1)
3933 RC = &X86::VK1RegClass;
3934 else if (RegVT == MVT::v8i1)
3935 RC = &X86::VK8RegClass;
3936 else if (RegVT == MVT::v16i1)
3937 RC = &X86::VK16RegClass;
3938 else if (RegVT == MVT::v32i1)
3939 RC = &X86::VK32RegClass;
3940 else if (RegVT == MVT::v64i1)
3941 RC = &X86::VK64RegClass;
3942 else
3943 llvm_unreachable("Unknown argument type!");
3944
3945 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3946 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3947 }
3948
3949 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3950 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3951 // right size.
3952 if (VA.getLocInfo() == CCValAssign::SExt)
3953 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3954 DAG.getValueType(VA.getValVT()));
3955 else if (VA.getLocInfo() == CCValAssign::ZExt)
3956 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3957 DAG.getValueType(VA.getValVT()));
3958 else if (VA.getLocInfo() == CCValAssign::BCvt)
3959 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3960
3961 if (VA.isExtInLoc()) {
3962 // Handle MMX values passed in XMM regs.
3963 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3964 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3965 else if (VA.getValVT().isVector() &&
3966 VA.getValVT().getScalarType() == MVT::i1 &&
3967 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3968 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3969 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3970 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3971 } else
3972 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3973 }
3974 } else {
3975 assert(VA.isMemLoc());
3976 ArgValue =
3977 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3978 }
3979
3980 // If value is passed via pointer - do a load.
3981 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3982 ArgValue =
3983 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3984
3985 InVals.push_back(ArgValue);
3986 }
3987
3988 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3989 if (Ins[I].Flags.isSwiftAsync()) {
3990 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3991 if (Subtarget.is64Bit())
3992 X86FI->setHasSwiftAsyncContext(true);
3993 else {
3994 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3995 X86FI->setSwiftAsyncContextFrameIdx(FI);
3996 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3997 DAG.getFrameIndex(FI, MVT::i32),
3998 MachinePointerInfo::getFixedStack(MF, FI));
3999 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4000 }
4001 }
4002
4003 // Swift calling convention does not require we copy the sret argument
4004 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4005 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4006 continue;
4007
4008 // All x86 ABIs require that for returning structs by value we copy the
4009 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4010 // the argument into a virtual register so that we can access it from the
4011 // return points.
4012 if (Ins[I].Flags.isSRet()) {
4013 assert(!FuncInfo->getSRetReturnReg() &&
4014 "SRet return has already been set");
4015 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4016 Register Reg =
4017 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4018 FuncInfo->setSRetReturnReg(Reg);
4019 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4020 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4021 break;
4022 }
4023 }
4024
4025 unsigned StackSize = CCInfo.getNextStackOffset();
4026 // Align stack specially for tail calls.
4027 if (shouldGuaranteeTCO(CallConv,
4028 MF.getTarget().Options.GuaranteedTailCallOpt))
4029 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4030
4031 if (IsVarArg)
4032 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4033 .lowerVarArgsParameters(Chain, StackSize);
4034
4035 // Some CCs need callee pop.
4036 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4037 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4038 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4039 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4040 // X86 interrupts must pop the error code (and the alignment padding) if
4041 // present.
4042 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4043 } else {
4044 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4045 // If this is an sret function, the return should pop the hidden pointer.
4046 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4047 FuncInfo->setBytesToPopOnReturn(4);
4048 }
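// For example, a 32-bit stdcall function taking 12 bytes of arguments sets
// BytesToPopOnReturn to 12 (it returns with `ret $12`), while a cdecl
// function normally pops nothing and, on targets where the hidden sret
// pointer is callee-popped, pops just those 4 bytes.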
4049
4050 if (!Is64Bit) {
4051 // RegSaveFrameIndex is X86-64 only.
4052 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4053 }
4054
4055 FuncInfo->setArgumentStackSize(StackSize);
4056
4057 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4058 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4059 if (Personality == EHPersonality::CoreCLR) {
4060 assert(Is64Bit);
4061 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4062 // that we'd prefer this slot be allocated towards the bottom of the frame
4063 // (i.e. near the stack pointer after allocating the frame). Every
4064 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4065 // offset from the bottom of this and each funclet's frame must be the
4066 // same, so the size of funclets' (mostly empty) frames is dictated by
4067 // how far this slot is from the bottom (since they allocate just enough
4068 // space to accommodate holding this slot at the correct offset).
4069 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4070 EHInfo->PSPSymFrameIdx = PSPSymFI;
4071 }
4072 }
4073
4074 if (CallConv == CallingConv::X86_RegCall ||
4075 F.hasFnAttribute("no_caller_saved_registers")) {
4076 MachineRegisterInfo &MRI = MF.getRegInfo();
4077 for (std::pair<Register, Register> Pair : MRI.liveins())
4078 MRI.disableCalleeSavedRegister(Pair.first);
4079 }
4080
4081 return Chain;
4082}
4083
4084SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4085 SDValue Arg, const SDLoc &dl,
4086 SelectionDAG &DAG,
4087 const CCValAssign &VA,
4088 ISD::ArgFlagsTy Flags,
4089 bool isByVal) const {
4090 unsigned LocMemOffset = VA.getLocMemOffset();
4091 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4092 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4093 StackPtr, PtrOff);
4094 if (isByVal)
4095 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4096
4097 MaybeAlign Alignment;
4098 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4099 Arg.getSimpleValueType() != MVT::f80)
4100 Alignment = MaybeAlign(4);
4101 return DAG.getStore(
4102 Chain, dl, Arg, PtrOff,
4103 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4104 Alignment);
4105}
4106
4107/// Emit a load of return address if tail call
4108/// optimization is performed and it is required.
4109SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4110 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4111 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4112 // Adjust the Return address stack slot.
4113 EVT VT = getPointerTy(DAG.getDataLayout());
4114 OutRetAddr = getReturnAddressFrameIndex(DAG);
4115
4116 // Load the "old" Return address.
4117 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4118 return SDValue(OutRetAddr.getNode(), 1);
4119}
4120
4121/// Emit a store of the return address if tail call
4122/// optimization is performed and it is required (FPDiff!=0).
4123static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4124 SDValue Chain, SDValue RetAddrFrIdx,
4125 EVT PtrVT, unsigned SlotSize,
4126 int FPDiff, const SDLoc &dl) {
4127 // Store the return address to the appropriate stack slot.
4128 if (!FPDiff) return Chain;
4129 // Calculate the new stack slot for the return address.
4130 int NewReturnAddrFI =
4131 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4132 false);
4133 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4134 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4135 MachinePointerInfo::getFixedStack(
4136 DAG.getMachineFunction(), NewReturnAddrFI));
4137 return Chain;
4138}
4139
4140 /// Returns a vector_shuffle mask for a movs{s|d} or movd
4141/// operation of specified width.
4142static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4143 SDValue V2) {
4144 unsigned NumElems = VT.getVectorNumElements();
4145 SmallVector<int, 8> Mask;
4146 Mask.push_back(NumElems);
4147 for (unsigned i = 1; i != NumElems; ++i)
4148 Mask.push_back(i);
4149 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4150}
4151
4152SDValue
4153X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4154 SmallVectorImpl<SDValue> &InVals) const {
4155 SelectionDAG &DAG = CLI.DAG;
4156 SDLoc &dl = CLI.DL;
4157 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4158 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4159 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4160 SDValue Chain = CLI.Chain;
4161 SDValue Callee = CLI.Callee;
4162 CallingConv::ID CallConv = CLI.CallConv;
4163 bool &isTailCall = CLI.IsTailCall;
4164 bool isVarArg = CLI.IsVarArg;
4165 const auto *CB = CLI.CB;
4166
4167 MachineFunction &MF = DAG.getMachineFunction();
4168 bool Is64Bit = Subtarget.is64Bit();
4169 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4170 bool IsSibcall = false;
4171 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4172 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4173 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4174 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4175 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4176 CB->hasFnAttr("no_caller_saved_registers"));
4177 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4178 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4179 const Module *M = MF.getMMI().getModule();
4180 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4181
4182 MachineFunction::CallSiteInfo CSInfo;
4183 if (CallConv == CallingConv::X86_INTR)
4184 report_fatal_error("X86 interrupts may not be called directly");
4185
4186 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4187 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4188 // If we are using a GOT, disable tail calls to external symbols with
4189 // default visibility. Tail calling such a symbol requires using a GOT
4190 // relocation, which forces early binding of the symbol. This breaks code
4191 // that requires lazy function symbol resolution. Using musttail or
4192 // GuaranteedTailCallOpt will override this.
4193 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4194 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4195 G->getGlobal()->hasDefaultVisibility()))
4196 isTailCall = false;
4197 }
4198
4199 if (isTailCall && !IsMustTail) {
4200 // Check if it's really possible to do a tail call.
4201 isTailCall = IsEligibleForTailCallOptimization(
4202 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4203 Ins, DAG);
4204
4205 // Sibcalls are automatically detected tailcalls which do not require
4206 // ABI changes.
4207 if (!IsGuaranteeTCO && isTailCall)
4208 IsSibcall = true;
4209
4210 if (isTailCall)
4211 ++NumTailCalls;
4212 }
4213
4214 if (IsMustTail && !isTailCall)
4215 report_fatal_error("failed to perform tail call elimination on a call "
4216 "site marked musttail");
4217
4218 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4219 "Var args not supported with calling convention fastcc, ghc or hipe");
4220
4221 // Analyze operands of the call, assigning locations to each operand.
4222 SmallVector<CCValAssign, 16> ArgLocs;
4223 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4224
4225 // Allocate shadow area for Win64.
4226 if (IsWin64)
4227 CCInfo.AllocateStack(32, Align(8));
4228
4229 CCInfo.AnalyzeArguments(Outs, CC_X86);
4230
4231 // In vectorcall calling convention a second pass is required for the HVA
4232 // types.
4233 if (CallingConv::X86_VectorCall == CallConv) {
4234 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4235 }
4236
4237 // Get a count of how many bytes are to be pushed on the stack.
4238 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4239 if (IsSibcall)
4240 // This is a sibcall. The memory operands are available in caller's
4241 // own caller's stack.
4242 NumBytes = 0;
4243 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4244 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4245
4246 int FPDiff = 0;
4247 if (isTailCall &&
4248 shouldGuaranteeTCO(CallConv,
4249 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4250 // Lower arguments at fp - stackoffset + fpdiff.
4251 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4252
4253 FPDiff = NumBytesCallerPushed - NumBytes;
4254
4255 // Set the delta of movement of the returnaddr stackslot.
4256 // But only update it if the new delta is smaller (more negative) than the previous one.
4257 if (FPDiff < X86Info->getTCReturnAddrDelta())
4258 X86Info->setTCReturnAddrDelta(FPDiff);
4259 }
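// For example, if the caller was entered with 16 bytes of stack arguments
// (BytesToPopOnReturn == 16) and this tail call needs NumBytes == 48, then
// FPDiff == -32: the callee needs 32 more bytes of argument space, so the
// return address slot has to move, and the most negative delta seen so far
// is what gets recorded in TCReturnAddrDelta.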
4260
4261 unsigned NumBytesToPush = NumBytes;
4262 unsigned NumBytesToPop = NumBytes;
4263
4264 // If we have an inalloca argument, all stack space has already been allocated
4265 // for us and is right at the top of the stack. We don't support multiple
4266 // arguments passed in memory when using inalloca.
4267 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4268 NumBytesToPush = 0;
4269 if (!ArgLocs.back().isMemLoc())
4270 report_fatal_error("cannot use inalloca attribute on a register "
4271 "parameter");
4272 if (ArgLocs.back().getLocMemOffset() != 0)
4273 report_fatal_error("any parameter with the inalloca attribute must be "
4274 "the only memory argument");
4275 } else if (CLI.IsPreallocated) {
4276 assert(ArgLocs.back().isMemLoc() &&
4277 "cannot use preallocated attribute on a register "
4278 "parameter");
4279 SmallVector<size_t, 4> PreallocatedOffsets;
4280 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4281 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4282 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4283 }
4284 }
4285 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4286 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4287 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4288 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4289 NumBytesToPush = 0;
4290 }
4291
4292 if (!IsSibcall && !IsMustTail)
4293 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4294 NumBytes - NumBytesToPush, dl);
4295
4296 SDValue RetAddrFrIdx;
4297 // Load return address for tail calls.
4298 if (isTailCall && FPDiff)
4299 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4300 Is64Bit, FPDiff, dl);
4301
4302 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4303 SmallVector<SDValue, 8> MemOpChains;
4304 SDValue StackPtr;
4305
4306 // The next loop assumes that the locations are in the same order as the
4307 // input arguments.
4308 assert(isSortedByValueNo(ArgLocs) &&
4309 "Argument Location list must be sorted before lowering");
4310
4311 // Walk the register/memloc assignments, inserting copies/loads. In the case
4312 // of tail call optimization, arguments are handled later.
4313 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4314 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4315 ++I, ++OutIndex) {
4316 assert(OutIndex < Outs.size() && "Invalid Out index");
4317 // Skip inalloca/preallocated arguments, they have already been written.
4318 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4319 if (Flags.isInAlloca() || Flags.isPreallocated())
4320 continue;
4321
4322 CCValAssign &VA = ArgLocs[I];
4323 EVT RegVT = VA.getLocVT();
4324 SDValue Arg = OutVals[OutIndex];
4325 bool isByVal = Flags.isByVal();
4326
4327 // Promote the value if needed.
4328 switch (VA.getLocInfo()) {
4329 default: llvm_unreachable("Unknown loc info!");
4330 case CCValAssign::Full: break;
4331 case CCValAssign::SExt:
4332 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4333 break;
4334 case CCValAssign::ZExt:
4335 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4336 break;
4337 case CCValAssign::AExt:
4338 if (Arg.getValueType().isVector() &&
4339 Arg.getValueType().getVectorElementType() == MVT::i1)
4340 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4341 else if (RegVT.is128BitVector()) {
4342 // Special case: passing MMX values in XMM registers.
4343 Arg = DAG.getBitcast(MVT::i64, Arg);
4344 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4345 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4346 } else
4347 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4348 break;
4349 case CCValAssign::BCvt:
4350 Arg = DAG.getBitcast(RegVT, Arg);
4351 break;
4352 case CCValAssign::Indirect: {
4353 if (isByVal) {
4354 // Memcpy the argument to a temporary stack slot to prevent
4355 // the caller from seeing any modifications the callee may make
4356 // as guaranteed by the `byval` attribute.
4357 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4358 Flags.getByValSize(),
4359 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4360 SDValue StackSlot =
4361 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4362 Chain =
4363 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4364 // From now on treat this as a regular pointer
4365 Arg = StackSlot;
4366 isByVal = false;
4367 } else {
4368 // Store the argument.
4369 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4370 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4371 Chain = DAG.getStore(
4372 Chain, dl, Arg, SpillSlot,
4373 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4374 Arg = SpillSlot;
4375 }
4376 break;
4377 }
4378 }
4379
4380 if (VA.needsCustom()) {
4381 assert(VA.getValVT() == MVT::v64i1 &&
4382 "Currently the only custom case is when we split v64i1 to 2 regs");
4383 // Split v64i1 value into two registers
4384 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4385 } else if (VA.isRegLoc()) {
4386 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4387 const TargetOptions &Options = DAG.getTarget().Options;
4388 if (Options.EmitCallSiteInfo)
4389 CSInfo.emplace_back(VA.getLocReg(), I);
4390 if (isVarArg && IsWin64) {
4391 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4392 // shadow reg if callee is a varargs function.
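// For example, a double passed in XMM1 to a varargs callee is also copied
// into RDX, its paired GPR, so va_arg code that only walks the GPR home
// area still finds the value.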
4393 Register ShadowReg;
4394 switch (VA.getLocReg()) {
4395 case X86::XMM0: ShadowReg = X86::RCX; break;
4396 case X86::XMM1: ShadowReg = X86::RDX; break;
4397 case X86::XMM2: ShadowReg = X86::R8; break;
4398 case X86::XMM3: ShadowReg = X86::R9; break;
4399 }
4400 if (ShadowReg)
4401 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4402 }
4403 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4404 assert(VA.isMemLoc());
4405 if (!StackPtr.getNode())
4406 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4407 getPointerTy(DAG.getDataLayout()));
4408 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4409 dl, DAG, VA, Flags, isByVal));
4410 }
4411 }
4412
4413 if (!MemOpChains.empty())
4414 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4415
4416 if (Subtarget.isPICStyleGOT()) {
4417 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4418 // GOT pointer (except regcall).
4419 if (!isTailCall) {
4420 // An indirect call with the RegCall calling convention may use up all the
4421 // general registers, so it is not suitable to reserve the EBX register for
4422 // the GOT address; just let the register allocator handle it.
4423 if (CallConv != CallingConv::X86_RegCall)
4424 RegsToPass.push_back(std::make_pair(
4425 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4426 getPointerTy(DAG.getDataLayout()))));
4427 } else {
4428 // If we are tail calling and generating PIC/GOT style code load the
4429 // address of the callee into ECX. The value in ecx is used as target of
4430 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4431 // for tail calls on PIC/GOT architectures. Normally we would just put the
4432 // address of GOT into ebx and then call target@PLT. But for tail calls
4433 // ebx would be restored (since ebx is callee saved) before jumping to the
4434 // target@PLT.
4435
4436 // Note: The actual moving to ECX is done further down.
4437 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4438 if (G && !G->getGlobal()->hasLocalLinkage() &&
4439 G->getGlobal()->hasDefaultVisibility())
4440 Callee = LowerGlobalAddress(Callee, DAG);
4441 else if (isa<ExternalSymbolSDNode>(Callee))
4442 Callee = LowerExternalSymbol(Callee, DAG);
4443 }
4444 }
4445
4446 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4447 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4448 // From AMD64 ABI document:
4449 // For calls that may call functions that use varargs or stdargs
4450 // (prototype-less calls or calls to functions containing ellipsis (...) in
4451 // the declaration) %al is used as a hidden argument to specify the number
4452 // of SSE registers used. The contents of %al do not need to match exactly
4453 // the number of registers, but must be an upper bound on the number of SSE
4454 // registers used and must be in the range 0 - 8 inclusive.
4455
4456 // Count the number of XMM registers allocated.
4457 static const MCPhysReg XMMArgRegs[] = {
4458 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4459 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4460 };
4461 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4462 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4463 && "SSE registers cannot be used when SSE is disabled");
4464 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4465 DAG.getConstant(NumXMMRegs, dl,
4466 MVT::i8)));
4467 }
4468
4469 if (isVarArg && IsMustTail) {
4470 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4471 for (const auto &F : Forwards) {
4472 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4473 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4474 }
4475 }
4476
4477 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4478 // don't need this because the eligibility check rejects calls that require
4479 // shuffling arguments passed in memory.
4480 if (!IsSibcall && isTailCall) {
4481 // Force all the incoming stack arguments to be loaded from the stack
4482 // before any new outgoing arguments are stored to the stack, because the
4483 // outgoing stack slots may alias the incoming argument stack slots, and
4484 // the alias isn't otherwise explicit. This is slightly more conservative
4485 // than necessary, because it means that each store effectively depends
4486 // on every argument instead of just those arguments it would clobber.
4487 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4488
4489 SmallVector<SDValue, 8> MemOpChains2;
4490 SDValue FIN;
4491 int FI = 0;
4492 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4493 ++I, ++OutsIndex) {
4494 CCValAssign &VA = ArgLocs[I];
4495
4496 if (VA.isRegLoc()) {
4497 if (VA.needsCustom()) {
4498 assert((CallConv == CallingConv::X86_RegCall) &&
4499 "Expecting custom case only in regcall calling convention");
4500 // This means that we are in special case where one argument was
4501 // passed through two register locations - Skip the next location
4502 ++I;
4503 }
4504
4505 continue;
4506 }
4507
4508 assert(VA.isMemLoc());
4509 SDValue Arg = OutVals[OutsIndex];
4510 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4511 // Skip inalloca/preallocated arguments. They don't require any work.
4512 if (Flags.isInAlloca() || Flags.isPreallocated())
4513 continue;
4514 // Create frame index.
4515 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4516 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4517 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4518 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4519
4520 if (Flags.isByVal()) {
4521 // Copy relative to framepointer.
4522 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4523 if (!StackPtr.getNode())
4524 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4525 getPointerTy(DAG.getDataLayout()));
4526 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4527 StackPtr, Source);
4528
4529 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4530 ArgChain,
4531 Flags, DAG, dl));
4532 } else {
4533 // Store relative to framepointer.
4534 MemOpChains2.push_back(DAG.getStore(
4535 ArgChain, dl, Arg, FIN,
4536 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4537 }
4538 }
4539
4540 if (!MemOpChains2.empty())
4541 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4542
4543 // Store the return address to the appropriate stack slot.
4544 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4545 getPointerTy(DAG.getDataLayout()),
4546 RegInfo->getSlotSize(), FPDiff, dl);
4547 }
4548
4549 // Build a sequence of copy-to-reg nodes chained together with token chain
4550 // and flag operands which copy the outgoing args into registers.
4551 SDValue InFlag;
4552 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4553 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4554 RegsToPass[i].second, InFlag);
4555 InFlag = Chain.getValue(1);
4556 }
4557
4558 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4559 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4560 // In the 64-bit large code model, we have to make all calls
4561 // through a register, since the call instruction's 32-bit
4562 // pc-relative offset may not be large enough to hold the whole
4563 // address.
4564 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4565 Callee->getOpcode() == ISD::ExternalSymbol) {
4566 // Lower direct calls to global addresses and external symbols. Setting
4567 // ForCall to true here has the effect of removing WrapperRIP when possible
4568 // to allow direct calls to be selected without first materializing the
4569 // address into a register.
4570 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4571 } else if (Subtarget.isTarget64BitILP32() &&
4572 Callee.getValueType() == MVT::i32) {
4573 // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI.
4574 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4575 }
4576
4577 // Returns a chain & a flag for retval copy to use.
4578 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4579 SmallVector<SDValue, 8> Ops;
4580
4581 if (!IsSibcall && isTailCall && !IsMustTail) {
4582 Chain = DAG.getCALLSEQ_END(Chain,
4583 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4584 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4585 InFlag = Chain.getValue(1);
4586 }
4587
4588 Ops.push_back(Chain);
4589 Ops.push_back(Callee);
4590
4591 if (isTailCall)
4592 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4593
4594 // Add argument registers to the end of the list so that they are known live
4595 // into the call.
4596 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4597 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4598 RegsToPass[i].second.getValueType()));
4599
4600 // Add a register mask operand representing the call-preserved registers.
4601 const uint32_t *Mask = [&]() {
4602 auto AdaptedCC = CallConv;
4603 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4604 // use X86_INTR calling convention because it has the same CSR mask
4605 // (same preserved registers).
4606 if (HasNCSR)
4607 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4608 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4609 // to use the CSR_NoRegs_RegMask.
4610 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4611 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4612 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4613 }();
4614 assert(Mask && "Missing call preserved mask for calling convention");
4615
4616 // If this is an invoke in a 32-bit function using a funclet-based
4617 // personality, assume the function clobbers all registers. If an exception
4618 // is thrown, the runtime will not restore CSRs.
4619 // FIXME: Model this more precisely so that we can register allocate across
4620 // the normal edge and spill and fill across the exceptional edge.
4621 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4622 const Function &CallerFn = MF.getFunction();
4623 EHPersonality Pers =
4624 CallerFn.hasPersonalityFn()
4625 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4626 : EHPersonality::Unknown;
4627 if (isFuncletEHPersonality(Pers))
4628 Mask = RegInfo->getNoPreservedMask();
4629 }
4630
4631 // Define a new register mask from the existing mask.
4632 uint32_t *RegMask = nullptr;
4633
4634 // In some calling conventions we need to remove the used physical registers
4635 // from the reg mask.
4636 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4637 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4638
4639 // Allocate a new Reg Mask and copy Mask.
4640 RegMask = MF.allocateRegMask();
4641 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4642 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4643
4644 // Make sure all sub registers of the argument registers are reset
4645 // in the RegMask.
4646 for (auto const &RegPair : RegsToPass)
4647 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4648 SubRegs.isValid(); ++SubRegs)
4649 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
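// For example, a register numbered 57 lives in RegMask word 57 / 32 == 1 at
// bit 57 % 32 == 25; clearing that bit marks the register (and, via the
// loop, each of its sub-registers) as clobbered by this call instead of
// preserved.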
4650
4651 // Create the RegMask Operand according to our updated mask.
4652 Ops.push_back(DAG.getRegisterMask(RegMask));
4653 } else {
4654 // Create the RegMask Operand according to the static mask.
4655 Ops.push_back(DAG.getRegisterMask(Mask));
4656 }
4657
4658 if (InFlag.getNode())
4659 Ops.push_back(InFlag);
4660
4661 if (isTailCall) {
4662 // We used to do:
4663 //// If this is the first return lowered for this function, add the regs
4664 //// to the liveout set for the function.
4665 // This isn't right, although it's probably harmless on x86; liveouts
4666 // should be computed from returns not tail calls. Consider a void
4667 // function making a tail call to a function returning int.
4668 MF.getFrameInfo().setHasTailCall();
4669 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4670 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4671 return Ret;
4672 }
4673
4674 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4675 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4676 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4677 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4678 // expanded to the call, directly followed by a special marker sequence and
4679 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4680 assert(!isTailCall &&
4681 "tail calls cannot be marked with clang.arc.attachedcall");
4682 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4683
4684 // Add a target global address for the retainRV/claimRV runtime function
4685 // just before the call target.
4686 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4687 auto PtrVT = getPointerTy(DAG.getDataLayout());
4688 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4689 Ops.insert(Ops.begin() + 1, GA);
4690 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4691 } else {
4692 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4693 }
4694
4695 InFlag = Chain.getValue(1);
4696 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4697 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4698
4699 // Save heapallocsite metadata.
4700 if (CLI.CB)
4701 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4702 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4703
4704 // Create the CALLSEQ_END node.
4705 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
4706 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4707 DAG.getTarget().Options.GuaranteedTailCallOpt))
4708 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4709 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
4710 // If this call passes a struct-return pointer, the callee
4711 // pops that struct pointer.
4712 NumBytesForCalleeToPop = 4;
4713
4714 // Returns a flag for retval copy to use.
4715 if (!IsSibcall) {
4716 Chain = DAG.getCALLSEQ_END(Chain,
4717 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4718 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4719 true),
4720 InFlag, dl);
4721 InFlag = Chain.getValue(1);
4722 }
4723
4724 // Handle result values, copying them out of physregs into vregs that we
4725 // return.
4726 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4727 InVals, RegMask);
4728}
4729
4730//===----------------------------------------------------------------------===//
4731// Fast Calling Convention (tail call) implementation
4732//===----------------------------------------------------------------------===//
4733
4734 // Like stdcall, the callee cleans up the arguments, except that ECX is
4735 // reserved for storing the tail-called function's address. Only 2 registers are
4736// free for argument passing (inreg). Tail call optimization is performed
4737// provided:
4738// * tailcallopt is enabled
4739// * caller/callee are fastcc
4740// On X86_64 architecture with GOT-style position independent code only local
4741// (within module) calls are supported at the moment.
4742 // To keep the stack aligned according to the platform ABI, the function
4743 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4744 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
4745// If a tail called function callee has more arguments than the caller the
4746// caller needs to make sure that there is room to move the RETADDR to. This is
4747// achieved by reserving an area the size of the argument delta right after the
4748// original RETADDR, but before the saved framepointer or the spilled registers
4749// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4750// stack layout:
4751// arg1
4752// arg2
4753// RETADDR
4754// [ new RETADDR
4755// move area ]
4756// (possible EBP)
4757// ESI
4758// EDI
4759// local1 ..
4760
4761/// Align the stack size so that it is, e.g., 16n + 12 for a 16-byte alignment
4762/// requirement (so that adding the return-address slot keeps the stack aligned).
4763unsigned
4764X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4765 SelectionDAG &DAG) const {
4766 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4767 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4768 assert(StackSize % SlotSize == 0 &&
4769 "StackSize must be a multiple of SlotSize");
4770 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4771}
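
As a standalone sketch of the arithmetic above (not LLVM code; alignedArgStackSize is a hypothetical helper that assumes a power-of-two stack alignment and re-implements alignTo inline), the following program shows why a 4-byte slot and 16-byte alignment always produce a size of the form 16n + 12:

#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical standalone version of the computation above: round
// StackSize + SlotSize up to StackAlignment, then subtract SlotSize so that
// pushing the (moved) return address leaves the stack aligned.
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlignment) {
  assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize");
  uint64_t Rounded =
      (StackSize + SlotSize + StackAlignment - 1) & ~(StackAlignment - 1);
  return Rounded - SlotSize;
}

int main() {
  // With SlotSize = 4 and a 16-byte alignment every result is 16n + 12.
  for (uint64_t Size = 0; Size <= 48; Size += 4)
    std::cout << Size << " -> " << alignedArgStackSize(Size, 4, 16) << '\n';
  return 0;
}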
4772
4773/// Return true if the given stack call argument is already available in the
4774/// same position (relatively) of the caller's incoming argument stack.
4775static
4776bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4777 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4778 const X86InstrInfo *TII, const CCValAssign &VA) {
4779 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4780
4781 for (;;) {
4782 // Look through nodes that don't alter the bits of the incoming value.
4783 unsigned Op = Arg.getOpcode();
4784 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4785 Arg = Arg.getOperand(0);
4786 continue;
4787 }
4788 if (Op == ISD::TRUNCATE) {
4789 const SDValue &TruncInput = Arg.getOperand(0);
4790 if (TruncInput.getOpcode() == ISD::AssertZext &&
4791 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4792 Arg.getValueType()) {
4793 Arg = TruncInput.getOperand(0);
4794 continue;
4795 }
4796 }
4797 break;
4798 }
4799
4800 int FI = INT_MAX;
4801 if (Arg.getOpcode() == ISD::CopyFromReg) {
4802 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4803 if (!VR.isVirtual())
4804 return false;
4805 MachineInstr *Def = MRI->getVRegDef(VR);
4806 if (!Def)
4807 return false;
4808 if (!Flags.isByVal()) {
4809 if (!TII->isLoadFromStackSlot(*Def, FI))
4810 return false;
4811 } else {
4812 unsigned Opcode = Def->getOpcode();
4813 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4814 Opcode == X86::LEA64_32r) &&
4815 Def->getOperand(1).isFI()) {
4816 FI = Def->getOperand(1).getIndex();
4817 Bytes = Flags.getByValSize();
4818 } else
4819 return false;
4820 }
4821 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4822 if (Flags.isByVal())
4823 // ByVal argument is passed in as a pointer but it's now being
4824 // dereferenced. e.g.
4825 // define @foo(%struct.X* %A) {
4826 // tail call @bar(%struct.X* byval %A)
4827 // }
4828 return false;
4829 SDValue Ptr = Ld->getBasePtr();
4830 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4831 if (!FINode)
4832 return false;
4833 FI = FINode->getIndex();
4834 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4835 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4836 FI = FINode->getIndex();
4837 Bytes = Flags.getByValSize();
4838 } else
4839 return false;
4840
4841 assert(FI != INT_MAX);
4842 if (!MFI.isFixedObjectIndex(FI))
4843 return false;
4844
4845 if (Offset != MFI.getObjectOffset(FI))
4846 return false;
4847
4848 // If this is not byval, check that the argument stack object is immutable.
4849 // inalloca and argument copy elision can create mutable argument stack
4850 // objects. Byval objects can be mutated, but a byval call intends to pass the
4851 // mutated memory.
4852 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4853 return false;
4854
4855 if (VA.getLocVT().getFixedSizeInBits() >
4856 Arg.getValueSizeInBits().getFixedSize()) {
4857 // If the argument location is wider than the argument type, check that any
4858 // extension flags match.
4859 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4860 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4861 return false;
4862 }
4863 }
4864
4865 return Bytes == MFI.getObjectSize(FI);
4866}
4867
4868/// Check whether the call is eligible for tail call optimization. Targets
4869/// that want to do tail call optimization should implement this function.
4870bool X86TargetLowering::IsEligibleForTailCallOptimization(
4871 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
4872 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
4873 const SmallVectorImpl<SDValue> &OutVals,
4874 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4875 if (!mayTailCallThisCC(CalleeCC))
4876 return false;
4877
4878 // If -tailcallopt is specified, make fastcc functions tail-callable.
4879 MachineFunction &MF = DAG.getMachineFunction();
4880 const Function &CallerF = MF.getFunction();
4881
4882 // If the function return type is x86_fp80 and the callee return type is not,
4883 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4884 // perform a tailcall optimization here.
4885 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4886 return false;
4887
4888 CallingConv::ID CallerCC = CallerF.getCallingConv();
4889 bool CCMatch = CallerCC == CalleeCC;
4890 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4891 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4892 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4893 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4894
4895 // Win64 functions have extra shadow space for argument homing. Don't do the
4896 // sibcall if the caller and callee have mismatched expectations for this
4897 // space.
4898 if (IsCalleeWin64 != IsCallerWin64)
4899 return false;
4900
4901 if (IsGuaranteeTCO) {
4902 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4903 return true;
4904 return false;
4905 }
4906
4907 // Look for obvious safe cases to perform tail call optimization that do not
4908 // require ABI changes. This is what gcc calls sibcall.
4909
4910 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4911 // emit a special epilogue.
4912 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4913 if (RegInfo->hasStackRealignment(MF))
4914 return false;
4915
4916 // Also avoid sibcall optimization if we're an sret return fn and the callee
4917 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
4918 // insufficient.
4919 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
4920 // For a compatible tail call the callee must return our sret pointer. So it
4921 // needs to be (a) an sret function itself and (b) we pass our sret as its
4922 // sret. Condition #b is harder to determine.
4923 return false;
4924 } else if (IsCalleePopSRet)
4925 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
4926 // expect that.
4927 return false;
4928
4929 // Do not sibcall optimize vararg calls unless all arguments are passed via
4930 // registers.
4931 LLVMContext &C = *DAG.getContext();
4932 if (isVarArg && !Outs.empty()) {
4933 // Optimizing for varargs on Win64 is unlikely to be safe without
4934 // additional testing.
4935 if (IsCalleeWin64 || IsCallerWin64)
4936 return false;
4937
4938 SmallVector<CCValAssign, 16> ArgLocs;
4939 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4940
4941 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4942 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4943 if (!ArgLocs[i].isRegLoc())
4944 return false;
4945 }
4946
4947 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4948 // stack. Therefore, if it's not used by the call it is not safe to optimize
4949 // this into a sibcall.
4950 bool Unused = false;
4951 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4952 if (!Ins[i].Used) {
4953 Unused = true;
4954 break;
4955 }
4956 }
4957 if (Unused) {
4958 SmallVector<CCValAssign, 16> RVLocs;
4959 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4960 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4961 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4962 CCValAssign &VA = RVLocs[i];
4963 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4964 return false;
4965 }
4966 }
4967
4968 // Check that the call results are passed in the same way.
4969 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4970 RetCC_X86, RetCC_X86))
4971 return false;
4972 // The callee has to preserve all registers the caller needs to preserve.
4973 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4974 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4975 if (!CCMatch) {
4976 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4977 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4978 return false;
4979 }
4980
4981 unsigned StackArgsSize = 0;
4982
4983 // If the callee takes no arguments then go on to check the results of the
4984 // call.
4985 if (!Outs.empty()) {
4986 // Check if stack adjustment is needed. For now, do not do this if any
4987 // argument is passed on the stack.
4988 SmallVector<CCValAssign, 16> ArgLocs;
4989 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4990
4991 // Allocate shadow area for Win64
4992 if (IsCalleeWin64)
4993 CCInfo.AllocateStack(32, Align(8));
4994
4995 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4996 StackArgsSize = CCInfo.getNextStackOffset();
4997
4998 if (CCInfo.getNextStackOffset()) {
4999 // Check if the arguments are already laid out in the right way as
5000 // the caller's fixed stack objects.
5001 MachineFrameInfo &MFI = MF.getFrameInfo();
5002 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5003 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5004 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5005 CCValAssign &VA = ArgLocs[i];
5006 SDValue Arg = OutVals[i];
5007 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5008 if (VA.getLocInfo() == CCValAssign::Indirect)
5009 return false;
5010 if (!VA.isRegLoc()) {
5011 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5012 MFI, MRI, TII, VA))
5013 return false;
5014 }
5015 }
5016 }
5017
5018 bool PositionIndependent = isPositionIndependent();
5019 // If the tailcall address may be in a register, then make sure it's
5020 // possible to register allocate for it. In 32-bit, the call address can
5021 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5022 // callee-saved registers are restored. These happen to be the same
5023 // registers used to pass 'inreg' arguments so watch out for those.
5024 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5025 !isa<ExternalSymbolSDNode>(Callee)) ||
5026 PositionIndependent)) {
5027 unsigned NumInRegs = 0;
5028 // In PIC we need an extra register to formulate the address computation
5029 // for the callee.
5030 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5031
5032 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5033 CCValAssign &VA = ArgLocs[i];
5034 if (!VA.isRegLoc())
5035 continue;
5036 Register Reg = VA.getLocReg();
5037 switch (Reg) {
5038 default: break;
5039 case X86::EAX: case X86::EDX: case X86::ECX:
5040 if (++NumInRegs == MaxInRegs)
5041 return false;
5042 break;
5043 }
5044 }
5045 }
5046
5047 const MachineRegisterInfo &MRI = MF.getRegInfo();
5048 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5049 return false;
5050 }
5051
5052 bool CalleeWillPop =
5053 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5054 MF.getTarget().Options.GuaranteedTailCallOpt);
5055
5056 if (unsigned BytesToPop =
5057 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5058 // If we have bytes to pop, the callee must pop them.
5059 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5060 if (!CalleePopMatches)
5061 return false;
5062 } else if (CalleeWillPop && StackArgsSize > 0) {
5063 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5064 return false;
5065 }
5066
5067 return true;
5068}
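
A minimal sketch of the final callee-pop compatibility check above, restated as a standalone predicate (calleePopCompatible is a hypothetical name, not an LLVM API):

#include <iostream>

// Hypothetical restatement of the check above: if the caller's own return
// already pops bytes, the callee must pop exactly the same amount; if it pops
// nothing, the callee must not pop stack arguments either.
static bool calleePopCompatible(bool CalleeWillPop, unsigned BytesToPop,
                                unsigned StackArgsSize) {
  if (BytesToPop)
    return CalleeWillPop && BytesToPop == StackArgsSize;
  return !(CalleeWillPop && StackArgsSize > 0);
}

int main() {
  std::cout << calleePopCompatible(true, 8, 8) << '\n';  // 1: amounts match
  std::cout << calleePopCompatible(true, 8, 4) << '\n';  // 0: mismatch
  std::cout << calleePopCompatible(true, 0, 4) << '\n';  // 0: unexpected pop
  std::cout << calleePopCompatible(false, 0, 4) << '\n'; // 1: nobody pops
  return 0;
}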
5069
5070FastISel *
5071X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5072 const TargetLibraryInfo *libInfo) const {
5073 return X86::createFastISel(funcInfo, libInfo);
5074}
5075
5076//===----------------------------------------------------------------------===//
5077// Other Lowering Hooks
5078//===----------------------------------------------------------------------===//
5079
5080bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5081 bool AssumeSingleUse) {
5082 if (!AssumeSingleUse && !Op.hasOneUse())
5083 return false;
5084 if (!ISD::isNormalLoad(Op.getNode()))
5085 return false;
5086
5087 // If this is an unaligned vector, make sure the target supports folding it.
5088 auto *Ld = cast<LoadSDNode>(Op.getNode());
5089 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5090 Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
5091 return false;
5092
5093 // TODO: If this is a non-temporal load and the target has an instruction
5094 // for it, it should not be folded. See "useNonTemporalLoad()".
5095
5096 return true;
5097}
5098
5099bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5100 const X86Subtarget &Subtarget,
5101 bool AssumeSingleUse) {
5102 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5103 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5104 return false;
5105
5106 // We can not replace a wide volatile load with a broadcast-from-memory,
5107 // because that would narrow the load, which isn't legal for volatiles.
5108 auto *Ld = cast<LoadSDNode>(Op.getNode());
5109 return !Ld->isVolatile() ||
5110 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5111}
5112
5113bool X86::mayFoldIntoStore(SDValue Op) {
5114 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5115}
5116
5117bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5118 if (Op.hasOneUse()) {
5119 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5120 return (ISD::ZERO_EXTEND == Opcode);
5121 }
5122 return false;
5123}
5124
5125static bool isTargetShuffle(unsigned Opcode) {
5126 switch(Opcode) {
5127 default: return false;
5128 case X86ISD::BLENDI:
5129 case X86ISD::PSHUFB:
5130 case X86ISD::PSHUFD:
5131 case X86ISD::PSHUFHW:
5132 case X86ISD::PSHUFLW:
5133 case X86ISD::SHUFP:
5134 case X86ISD::INSERTPS:
5135 case X86ISD::EXTRQI:
5136 case X86ISD::INSERTQI:
5137 case X86ISD::VALIGN:
5138 case X86ISD::PALIGNR:
5139 case X86ISD::VSHLDQ:
5140 case X86ISD::VSRLDQ:
5141 case X86ISD::MOVLHPS:
5142 case X86ISD::MOVHLPS:
5143 case X86ISD::MOVSHDUP:
5144 case X86ISD::MOVSLDUP:
5145 case X86ISD::MOVDDUP:
5146 case X86ISD::MOVSS:
5147 case X86ISD::MOVSD:
5148 case X86ISD::MOVSH:
5149 case X86ISD::UNPCKL:
5150 case X86ISD::UNPCKH:
5151 case X86ISD::VBROADCAST:
5152 case X86ISD::VPERMILPI:
5153 case X86ISD::VPERMILPV:
5154 case X86ISD::VPERM2X128:
5155 case X86ISD::SHUF128:
5156 case X86ISD::VPERMIL2:
5157 case X86ISD::VPERMI:
5158 case X86ISD::VPPERM:
5159 case X86ISD::VPERMV:
5160 case X86ISD::VPERMV3:
5161 case X86ISD::VZEXT_MOVL:
5162 return true;
5163 }
5164}
5165
5166static bool isTargetShuffleVariableMask(unsigned Opcode) {
5167 switch (Opcode) {
5168 default: return false;
5169 // Target Shuffles.
5170 case X86ISD::PSHUFB:
5171 case X86ISD::VPERMILPV:
5172 case X86ISD::VPERMIL2:
5173 case X86ISD::VPPERM:
5174 case X86ISD::VPERMV:
5175 case X86ISD::VPERMV3:
5176 return true;
5177 // 'Faux' Target Shuffles.
5178 case ISD::OR:
5179 case ISD::AND:
5180 case X86ISD::ANDNP:
5181 return true;
5182 }
5183}
5184
5185static bool isTargetShuffleSplat(SDValue Op) {
5186 unsigned Opcode = Op.getOpcode();
5187 if (Opcode == ISD::EXTRACT_SUBVECTOR)
5188 return isTargetShuffleSplat(Op.getOperand(0));
5189 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
5190}
5191
5192SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5193 MachineFunction &MF = DAG.getMachineFunction();
5194 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5195 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5196 int ReturnAddrIndex = FuncInfo->getRAIndex();
5197
5198 if (ReturnAddrIndex == 0) {
5199 // Set up a frame object for the return address.
5200 unsigned SlotSize = RegInfo->getSlotSize();
5201 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5202 -(int64_t)SlotSize,
5203 false);
5204 FuncInfo->setRAIndex(ReturnAddrIndex);
5205 }
5206
5207 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5208}
5209
5210bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5211 bool hasSymbolicDisplacement) {
5212 // Offset should fit into 32 bit immediate field.
5213 if (!isInt<32>(Offset))
5214 return false;
5215
5216 // If we don't have a symbolic displacement - we don't have any extra
5217 // restrictions.
5218 if (!hasSymbolicDisplacement)
5219 return true;
5220
5221 // FIXME: Some tweaks might be needed for medium code model.
5222 if (M != CodeModel::Small && M != CodeModel::Kernel)
5223 return false;
5224
5225 // For the small code model we assume that the last object ends at least 16MB
5226 // before the 2^31 boundary. We may also accept fairly large negative constants,
5227 // knowing that all objects are in the positive half of the address space.
5228 if (M == CodeModel::Small && Offset < 16*1024*1024)
5229 return true;
5230
5231 // For the kernel code model we know that all objects reside in the negative
5232 // half of the 32-bit address space. We may not accept negative offsets, since
5233 // they may be just out of range, but we may accept fairly large positive ones.
5234 if (M == CodeModel::Kernel && Offset >= 0)
5235 return true;
5236
5237 return false;
5238}
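
As a rough standalone sketch of the decision above when a symbolic displacement is present (the enum and function names are hypothetical; the real code also handles the no-symbol case and rejects other code models):

#include <cstdint>
#include <iostream>

// Hypothetical mirror of the symbolic-displacement checks above: the offset
// must fit a signed 32-bit immediate; the small model additionally requires
// it to stay below 16MB, the kernel model requires it to be non-negative.
enum class Model { Small, Kernel };

static bool offsetOkWithSymbol(int64_t Offset, Model M) {
  if (Offset < INT32_MIN || Offset > INT32_MAX)
    return false;
  if (M == Model::Small)
    return Offset < 16 * 1024 * 1024;
  return Offset >= 0; // Model::Kernel
}

int main() {
  std::cout << offsetOkWithSymbol(15 * 1024 * 1024, Model::Small) << '\n'; // 1
  std::cout << offsetOkWithSymbol(-4096, Model::Small) << '\n';            // 1
  std::cout << offsetOkWithSymbol(-1, Model::Kernel) << '\n';              // 0
  std::cout << offsetOkWithSymbol(1 << 20, Model::Kernel) << '\n';         // 1
  return 0;
}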
5239
5240/// Determines whether the callee is required to pop its own arguments.
5241/// Callee pop is necessary to support tail calls.
5242bool X86::isCalleePop(CallingConv::ID CallingConv,
5243 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5244 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5245 // can guarantee TCO.
5246 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5247 return true;
5248
5249 switch (CallingConv) {
5250 default:
5251 return false;
5252 case CallingConv::X86_StdCall:
5253 case CallingConv::X86_FastCall:
5254 case CallingConv::X86_ThisCall:
5255 case CallingConv::X86_VectorCall:
5256 return !is64Bit;
5257 }
5258}
5259
5260/// Return true if the condition is a signed comparison operation.
5261static bool isX86CCSigned(unsigned X86CC) {
5262 switch (X86CC) {
5263 default:
5264 llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5264)
;
5265 case X86::COND_E:
5266 case X86::COND_NE:
5267 case X86::COND_B:
5268 case X86::COND_A:
5269 case X86::COND_BE:
5270 case X86::COND_AE:
5271 return false;
5272 case X86::COND_G:
5273 case X86::COND_GE:
5274 case X86::COND_L:
5275 case X86::COND_LE:
5276 return true;
5277 }
5278}
5279
5280static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5281 switch (SetCCOpcode) {
5282 default: llvm_unreachable("Invalid integer condition!");
5283 case ISD::SETEQ: return X86::COND_E;
5284 case ISD::SETGT: return X86::COND_G;
5285 case ISD::SETGE: return X86::COND_GE;
5286 case ISD::SETLT: return X86::COND_L;
5287 case ISD::SETLE: return X86::COND_LE;
5288 case ISD::SETNE: return X86::COND_NE;
5289 case ISD::SETULT: return X86::COND_B;
5290 case ISD::SETUGT: return X86::COND_A;
5291 case ISD::SETULE: return X86::COND_BE;
5292 case ISD::SETUGE: return X86::COND_AE;
5293 }
5294}
5295
5296/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5297/// condition code, returning the condition code and the LHS/RHS of the
5298/// comparison to make.
5299static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5300 bool isFP, SDValue &LHS, SDValue &RHS,
5301 SelectionDAG &DAG) {
5302 if (!isFP) {
5303 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5304 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5305 // X > -1 -> X == 0, jump !sign.
5306 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5307 return X86::COND_NS;
5308 }
5309 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5310 // X < 0 -> X == 0, jump on sign.
5311 return X86::COND_S;
5312 }
5313 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5314 // X >= 0 -> X == 0, jump on !sign.
5315 return X86::COND_NS;
5316 }
5317 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5318 // X < 1 -> X <= 0
5319 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5320 return X86::COND_LE;
5321 }
5322 }
5323
5324 return TranslateIntegerX86CC(SetCCOpcode);
5325 }
5326
5327 // First determine if it is required or is profitable to flip the operands.
5328
5329 // If LHS is a foldable load, but RHS is not, flip the condition.
5330 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5331 !ISD::isNON_EXTLoad(RHS.getNode())) {
5332 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5333 std::swap(LHS, RHS);
5334 }
5335
5336 switch (SetCCOpcode) {
5337 default: break;
5338 case ISD::SETOLT:
5339 case ISD::SETOLE:
5340 case ISD::SETUGT:
5341 case ISD::SETUGE:
5342 std::swap(LHS, RHS);
5343 break;
5344 }
5345
5346 // On a floating point condition, the flags are set as follows:
5347 // ZF PF CF op
5348 // 0 | 0 | 0 | X > Y
5349 // 0 | 0 | 1 | X < Y
5350 // 1 | 0 | 0 | X == Y
5351 // 1 | 1 | 1 | unordered
5352 switch (SetCCOpcode) {
5353 default: llvm_unreachable("Condcode should be pre-legalized away");
5354 case ISD::SETUEQ:
5355 case ISD::SETEQ: return X86::COND_E;
5356 case ISD::SETOLT: // flipped
5357 case ISD::SETOGT:
5358 case ISD::SETGT: return X86::COND_A;
5359 case ISD::SETOLE: // flipped
5360 case ISD::SETOGE:
5361 case ISD::SETGE: return X86::COND_AE;
5362 case ISD::SETUGT: // flipped
5363 case ISD::SETULT:
5364 case ISD::SETLT: return X86::COND_B;
5365 case ISD::SETUGE: // flipped
5366 case ISD::SETULE:
5367 case ISD::SETLE: return X86::COND_BE;
5368 case ISD::SETONE:
5369 case ISD::SETNE: return X86::COND_NE;
5370 case ISD::SETUO: return X86::COND_P;
5371 case ISD::SETO: return X86::COND_NP;
5372 case ISD::SETOEQ:
5373 case ISD::SETUNE: return X86::COND_INVALID;
5374 }
5375}
5376
5377/// Is there a floating point cmov for the specific X86 condition code?
5378/// Current x86 isa includes the following FP cmov instructions:
5379/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5380static bool hasFPCMov(unsigned X86CC) {
5381 switch (X86CC) {
5382 default:
5383 return false;
5384 case X86::COND_B:
5385 case X86::COND_BE:
5386 case X86::COND_E:
5387 case X86::COND_P:
5388 case X86::COND_A:
5389 case X86::COND_AE:
5390 case X86::COND_NE:
5391 case X86::COND_NP:
5392 return true;
5393 }
5394}
5395
5396static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5397 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5398 VT.is512BitVector();
5399}
5400
5401bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5402 const CallInst &I,
5403 MachineFunction &MF,
5404 unsigned Intrinsic) const {
5405 Info.flags = MachineMemOperand::MONone;
5406 Info.offset = 0;
5407
5408 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5409 if (!IntrData) {
5410 switch (Intrinsic) {
5411 case Intrinsic::x86_aesenc128kl:
5412 case Intrinsic::x86_aesdec128kl:
5413 Info.opc = ISD::INTRINSIC_W_CHAIN;
5414 Info.ptrVal = I.getArgOperand(1);
5415 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5416 Info.align = Align(1);
5417 Info.flags |= MachineMemOperand::MOLoad;
5418 return true;
5419 case Intrinsic::x86_aesenc256kl:
5420 case Intrinsic::x86_aesdec256kl:
5421 Info.opc = ISD::INTRINSIC_W_CHAIN;
5422 Info.ptrVal = I.getArgOperand(1);
5423 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5424 Info.align = Align(1);
5425 Info.flags |= MachineMemOperand::MOLoad;
5426 return true;
5427 case Intrinsic::x86_aesencwide128kl:
5428 case Intrinsic::x86_aesdecwide128kl:
5429 Info.opc = ISD::INTRINSIC_W_CHAIN;
5430 Info.ptrVal = I.getArgOperand(0);
5431 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5432 Info.align = Align(1);
5433 Info.flags |= MachineMemOperand::MOLoad;
5434 return true;
5435 case Intrinsic::x86_aesencwide256kl:
5436 case Intrinsic::x86_aesdecwide256kl:
5437 Info.opc = ISD::INTRINSIC_W_CHAIN;
5438 Info.ptrVal = I.getArgOperand(0);
5439 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5440 Info.align = Align(1);
5441 Info.flags |= MachineMemOperand::MOLoad;
5442 return true;
5443 case Intrinsic::x86_atomic_bts:
5444 case Intrinsic::x86_atomic_btc:
5445 case Intrinsic::x86_atomic_btr: {
5446 Info.opc = ISD::INTRINSIC_W_CHAIN;
5447 Info.ptrVal = I.getArgOperand(0);
5448 unsigned Size = I.getType()->getScalarSizeInBits();
5449 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5450 Info.align = Align(Size);
5451 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5452 MachineMemOperand::MOVolatile;
5453 return true;
5454 }
5455 }
5456 return false;
5457 }
5458
5459 switch (IntrData->Type) {
5460 case TRUNCATE_TO_MEM_VI8:
5461 case TRUNCATE_TO_MEM_VI16:
5462 case TRUNCATE_TO_MEM_VI32: {
5463 Info.opc = ISD::INTRINSIC_VOID;
5464 Info.ptrVal = I.getArgOperand(0);
5465 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5466 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5467 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5468 ScalarVT = MVT::i8;
5469 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5470 ScalarVT = MVT::i16;
5471 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5472 ScalarVT = MVT::i32;
5473
5474 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5475 Info.align = Align(1);
5476 Info.flags |= MachineMemOperand::MOStore;
5477 break;
5478 }
5479 case GATHER:
5480 case GATHER_AVX2: {
5481 Info.opc = ISD::INTRINSIC_W_CHAIN;
5482 Info.ptrVal = nullptr;
5483 MVT DataVT = MVT::getVT(I.getType());
5484 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5485 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5486 IndexVT.getVectorNumElements());
5487 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5488 Info.align = Align(1);
5489 Info.flags |= MachineMemOperand::MOLoad;
5490 break;
5491 }
5492 case SCATTER: {
5493 Info.opc = ISD::INTRINSIC_VOID;
5494 Info.ptrVal = nullptr;
5495 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5496 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5497 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5498 IndexVT.getVectorNumElements());
5499 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5500 Info.align = Align(1);
5501 Info.flags |= MachineMemOperand::MOStore;
5502 break;
5503 }
5504 default:
5505 return false;
5506 }
5507
5508 return true;
5509}
5510
5511/// Returns true if the target can instruction select the
5512/// specified FP immediate natively. If false, the legalizer will
5513/// materialize the FP immediate as a load from a constant pool.
5514bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5515 bool ForCodeSize) const {
5516 for (const APFloat &FPImm : LegalFPImmediates)
5517 if (Imm.bitwiseIsEqual(FPImm))
5518 return true;
5519 return false;
5520}
5521
5522bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5523 ISD::LoadExtType ExtTy,
5524 EVT NewVT) const {
5525 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow")(static_cast <bool> (cast<LoadSDNode>(Load)->isSimple
() && "illegal to narrow") ? void (0) : __assert_fail
("cast<LoadSDNode>(Load)->isSimple() && \"illegal to narrow\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5525, __extension__
__PRETTY_FUNCTION__))
;
5526
5527 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5528 // relocation targets a movq or addq instruction: don't let the load shrink.
5529 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5530 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5531 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5532 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5533
5534 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5535 // those uses are extracted directly into a store, then the extract + store
5536 // can be store-folded. Therefore, it's probably not worth splitting the load.
5537 EVT VT = Load->getValueType(0);
5538 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5539 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5540 // Skip uses of the chain value. Result 0 of the node is the load value.
5541 if (UI.getUse().getResNo() != 0)
5542 continue;
5543
5544 // If this use is not an extract + store, it's probably worth splitting.
5545 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5546 UI->use_begin()->getOpcode() != ISD::STORE)
5547 return true;
5548 }
5549 // All non-chain uses are extract + store.
5550 return false;
5551 }
5552
5553 return true;
5554}
5555
5556/// Returns true if it is beneficial to convert a load of a constant
5557/// to just the constant itself.
5558bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5559 Type *Ty) const {
5560 assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) :
__assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 5560, __extension__ __PRETTY_FUNCTION__))
;
5561
5562 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5563 if (BitSize == 0 || BitSize > 64)
5564 return false;
5565 return true;
5566}
5567
5568bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5569 // If we are using XMM registers in the ABI and the condition of the select is
5570 // a floating-point compare and we have blendv or conditional move, then it is
5571 // cheaper to select instead of doing a cross-register move and creating a
5572 // load that depends on the compare result.
5573 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5574 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5575}
5576
5577bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5578 // TODO: It might be a win to ease or lift this restriction, but the generic
5579 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5580 if (VT.isVector() && Subtarget.hasAVX512())
5581 return false;
5582
5583 return true;
5584}
5585
5586bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5587 SDValue C) const {
5588 // TODO: We handle scalars using custom code, but generic combining could make
5589 // that unnecessary.
5590 APInt MulC;
5591 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5592 return false;
5593
5594 // Find the type this will be legalized to. Otherwise we might prematurely
5595 // convert this to shl+add/sub and then still have to type legalize those ops.
5596 // Another choice would be to defer the decision for illegal types until
5597 // after type legalization. But constant splat vectors of i64 can't make it
5598 // through type legalization on 32-bit targets so we would need to special
5599 // case vXi64.
5600 while (getTypeAction(Context, VT) != TypeLegal)
5601 VT = getTypeToTransformTo(Context, VT);
5602
5603 // If vector multiply is legal, assume that's faster than shl + add/sub.
5604 // Multiply is a complex op with higher latency and lower throughput in
5605 // most implementations, sub-vXi32 vector multiplies are always fast,
5606 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
5607 // is always going to be slow.
5608 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5609 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5610 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5611 return false;
5612
5613 // shl+add, shl+sub, shl+add+neg
5614 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5615 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5616}
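
A minimal sketch of the final power-of-two test above on a plain 64-bit constant (isPow2 and decomposableMul are hypothetical helper names; the real code works on an APInt splat):

#include <cstdint>
#include <iostream>

// A multiply by C can be rewritten as shl+add / shl+sub / shl+add+neg when
// C+1, C-1, 1-C or -(C+1) is a power of two, e.g. 5 = 4+1, 7 = 8-1, -3 = 1-4.
static bool isPow2(int64_t V) { return V > 0 && (V & (V - 1)) == 0; }

static bool decomposableMul(int64_t C) {
  return isPow2(C + 1) || isPow2(C - 1) || isPow2(1 - C) || isPow2(-(C + 1));
}

int main() {
  for (int64_t C : {3, 5, 7, 9, 10, -3, -9, 22})
    std::cout << C << " -> " << decomposableMul(C) << '\n';
  return 0;
}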
5617
5618bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5619 unsigned Index) const {
5620 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5621 return false;
5622
5623 // Mask vectors support all subregister combinations and operations that
5624 // extract half of vector.
5625 if (ResVT.getVectorElementType() == MVT::i1)
5626 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5627 (Index == ResVT.getVectorNumElements()));
5628
5629 return (Index % ResVT.getVectorNumElements()) == 0;
5630}
5631
5632bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5633 unsigned Opc = VecOp.getOpcode();
5634
5635 // Assume target opcodes can't be scalarized.
5636 // TODO - do we have any exceptions?
5637 if (Opc >= ISD::BUILTIN_OP_END)
5638 return false;
5639
5640 // If the vector op is not supported, try to convert to scalar.
5641 EVT VecVT = VecOp.getValueType();
5642 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5643 return true;
5644
5645 // If the vector op is supported, but the scalar op is not, the transform may
5646 // not be worthwhile.
5647 EVT ScalarVT = VecVT.getScalarType();
5648 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5649}
5650
5651bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5652 bool) const {
5653 // TODO: Allow vectors?
5654 if (VT.isVector())
5655 return false;
5656 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5657}
5658
5659bool X86TargetLowering::isCheapToSpeculateCttz() const {
5660 // Speculate cttz only if we can directly use TZCNT.
5661 return Subtarget.hasBMI();
5662}
5663
5664bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5665 // Speculate ctlz only if we can directly use LZCNT.
5666 return Subtarget.hasLZCNT();
5667}
5668
5669bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
5670 return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
5671 (VT == MVT::f16 && Subtarget.hasFP16());
5672}
5673
5674bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
5675 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
5676 // expensive than a straight movsd. On the other hand, it's important to
5677 // shrink long double fp constant since fldt is very slow.
5678 return !Subtarget.hasSSE2() || VT == MVT::f80;
5679}
5680
5681bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
5682 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
5683 (VT == MVT::f32 && Subtarget.hasSSE1()) ||
5684 (VT == MVT::f16 && Subtarget.hasFP16());
5685}
5686
5687bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5688 const SelectionDAG &DAG,
5689 const MachineMemOperand &MMO) const {
5690 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5691 BitcastVT.getVectorElementType() == MVT::i1)
5692 return false;
5693
5694 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5695 return false;
5696
5697 // If both types are legal vectors, it's always ok to convert them.
5698 if (LoadVT.isVector() && BitcastVT.isVector() &&
5699 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5700 return true;
5701
5702 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5703}
5704
5705bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5706 const MachineFunction &MF) const {
5707 // Do not merge up to a float value size (128 bits) if the NoImplicitFloat
5708 // attribute is set.
5709 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
5710
5711 if (NoFloat) {
5712 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5713 return (MemVT.getSizeInBits() <= MaxIntSize);
5714 }
5715 // Make sure we don't merge greater than our preferred vector
5716 // width.
5717 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5718 return false;
5719
5720 return true;
5721}
5722
5723bool X86TargetLowering::isCtlzFast() const {
5724 return Subtarget.hasFastLZCNT();
5725}
5726
5727bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5728 const Instruction &AndI) const {
5729 return true;
5730}
5731
5732bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5733 EVT VT = Y.getValueType();
5734
5735 if (VT.isVector())
5736 return false;
5737
5738 if (!Subtarget.hasBMI())
5739 return false;
5740
5741 // There are only 32-bit and 64-bit forms for 'andn'.
5742 if (VT != MVT::i32 && VT != MVT::i64)
5743 return false;
5744
5745 return !isa<ConstantSDNode>(Y);
5746}
5747
5748bool X86TargetLowering::hasAndNot(SDValue Y) const {
5749 EVT VT = Y.getValueType();
5750
5751 if (!VT.isVector())
5752 return hasAndNotCompare(Y);
5753
5754 // Vector.
5755
5756 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5757 return false;
5758
5759 if (VT == MVT::v4i32)
5760 return true;
5761
5762 return Subtarget.hasSSE2();
5763}
5764
5765bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5766 return X.getValueType().isScalarInteger(); // 'bt'
5767}
5768
5769bool X86TargetLowering::
5770 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5771 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5772 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5773 SelectionDAG &DAG) const {
5774 // Does baseline recommend not to perform the fold by default?
5775 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5776 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5777 return false;
5778 // For scalars this transform is always beneficial.
5779 if (X.getValueType().isScalarInteger())
5780 return true;
5781 // If all the shift amounts are identical, then transform is beneficial even
5782 // with rudimentary SSE2 shifts.
5783 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5784 return true;
5786 // If we have AVX2 with its powerful shift operations, then it's also good.
5786 if (Subtarget.hasAVX2())
5787 return true;
5788 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5789 return NewShiftOpcode == ISD::SHL;
5790}
5791
5792bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5793 const SDNode *N, CombineLevel Level) const {
5794 assert(((N->getOpcode() == ISD::SHL &&(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5795 N->getOperand(0).getOpcode() == ISD::SRL) ||(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5796 (N->getOpcode() == ISD::SRL &&(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5797 N->getOperand(0).getOpcode() == ISD::SHL)) &&(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
5798 "Expected shift-shift mask")(static_cast <bool> (((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode
() == ISD::SRL && N->getOperand(0).getOpcode() == ISD
::SHL)) && "Expected shift-shift mask") ? void (0) : __assert_fail
("((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && \"Expected shift-shift mask\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5798, __extension__
__PRETTY_FUNCTION__))
;
5799 EVT VT = N->getValueType(0);
5800 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5801 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5802 // Only fold if the shift values are equal - so it folds to AND.
5803 // TODO - we should fold if either is a non-uniform vector but we don't do
5804 // the fold for non-splats yet.
5805 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5806 }
5807 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5808}
5809
5810bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5811 EVT VT = Y.getValueType();
5812
5813 // For vectors, we don't have a preference, but we probably want a mask.
5814 if (VT.isVector())
5815 return false;
5816
5817 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5818 if (VT == MVT::i64 && !Subtarget.is64Bit())
5819 return false;
5820
5821 return true;
5822}
5823
5824bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5825 SDNode *N) const {
5826 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5827 !Subtarget.isOSWindows())
5828 return false;
5829 return true;
5830}
5831
5832bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5833 // Any legal vector type can be splatted more efficiently than
5834 // loading/spilling from memory.
5835 return isTypeLegal(VT);
5836}
5837
5838MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5839 MVT VT = MVT::getIntegerVT(NumBits);
5840 if (isTypeLegal(VT))
5841 return VT;
5842
5843 // PMOVMSKB can handle this.
5844 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5845 return MVT::v16i8;
5846
5847 // VPMOVMSKB can handle this.
5848 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5849 return MVT::v32i8;
5850
5851 // TODO: Allow 64-bit type for 32-bit target.
5852 // TODO: 512-bit types should be allowed, but make sure that those
5853 // cases are handled in combineVectorSizedSetCCEquality().
5854
5855 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5856}
5857
5858/// Val is the undef sentinel value or equal to the specified value.
5859static bool isUndefOrEqual(int Val, int CmpVal) {
5860 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5861}
5862
5863/// Return true if every element in Mask is the undef sentinel value or equal to
5864/// the specified value.
5865static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5866 return llvm::all_of(Mask, [CmpVal](int M) {
5867 return (M == SM_SentinelUndef) || (M == CmpVal);
5868 });
5869}
5870
5871/// Val is either the undef or zero sentinel value.
5872static bool isUndefOrZero(int Val) {
5873 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5874}
5875
5876/// Return true if every element in Mask, beginning from position Pos and ending
5877/// in Pos+Size is the undef sentinel value.
5878static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5879 return llvm::all_of(Mask.slice(Pos, Size),
5880 [](int M) { return M == SM_SentinelUndef; });
5881}
5882
5883/// Return true if the mask creates a vector whose lower half is undefined.
5884static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5885 unsigned NumElts = Mask.size();
5886 return isUndefInRange(Mask, 0, NumElts / 2);
5887}
5888
5889/// Return true if the mask creates a vector whose upper half is undefined.
5890static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5891 unsigned NumElts = Mask.size();
5892 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5893}
5894
5895/// Return true if Val falls within the specified range [Low, Hi).
5896static bool isInRange(int Val, int Low, int Hi) {
5897 return (Val >= Low && Val < Hi);
5898}
5899
5900/// Return true if the value of any element in Mask falls within the specified
5901/// range [Low, Hi).
5902static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5903 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5904}
5905
5906/// Return true if the value of any element in Mask is the zero sentinel value.
5907static bool isAnyZero(ArrayRef<int> Mask) {
5908 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5909}
5910
5911/// Return true if the value of any element in Mask is the zero or undef
5912/// sentinel values.
5913static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5914 return llvm::any_of(Mask, [](int M) {
5915 return M == SM_SentinelZero || M == SM_SentinelUndef;
5916 });
5917}
5918
5919/// Return true if Val is undef or if its value falls within the
5920/// specified range [Low, Hi).
5921static bool isUndefOrInRange(int Val, int Low, int Hi) {
5922 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5923}
5924
5925/// Return true if every element in Mask is undef or if its value
5926/// falls within the specified range [Low, Hi).
5927static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5928 return llvm::all_of(
5929 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5930}
5931
5932/// Return true if Val is undef, zero or if its value falls within the
5933/// specified range [Low, Hi).
5934static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5935 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5936}
5937
5938/// Return true if every element in Mask is undef, zero or if its value
5939/// falls within the specified range [Low, Hi).
5940static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5941 return llvm::all_of(
5942 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5943}
5944
5945/// Return true if every element in Mask, beginning
5946/// from position Pos and ending in Pos + Size, falls within the specified
5947/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5948static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5949 unsigned Size, int Low, int Step = 1) {
5950 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5951 if (!isUndefOrEqual(Mask[i], Low))
5952 return false;
5953 return true;
5954}
5955
5956/// Return true if every element in Mask, beginning
5957/// from position Pos and ending in Pos+Size, falls within the specified
5958/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), is undef, or is zero.
5959static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5960 unsigned Size, int Low,
5961 int Step = 1) {
5962 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5963 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5964 return false;
5965 return true;
5966}
5967
5968/// Return true if every element in Mask, beginning
5969/// from position Pos and ending in Pos+Size is undef or is zero.
5970static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5971 unsigned Size) {
5972 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
5973}
5974
5975/// Helper function to test whether a shuffle mask could be
5976/// simplified by widening the elements being shuffled.
5977///
5978/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5979/// leaves it in an unspecified state.
5980///
5981/// NOTE: This must handle normal vector shuffle masks and *target* vector
5982/// shuffle masks. The latter have the special property of a '-2' representing
5983/// a zero-ed lane of a vector.
5984static bool canWidenShuffleElements(ArrayRef<int> Mask,
5985 SmallVectorImpl<int> &WidenedMask) {
5986 WidenedMask.assign(Mask.size() / 2, 0);
5987 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5988 int M0 = Mask[i];
5989 int M1 = Mask[i + 1];
5990
5991 // If both elements are undef, it's trivial.
5992 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5993 WidenedMask[i / 2] = SM_SentinelUndef;
5994 continue;
5995 }
5996
5997 // Check for an undef mask and a mask value properly aligned to fit with
5998 // a pair of values. If we find such a case, use the non-undef mask's value.
5999 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6000 WidenedMask[i / 2] = M1 / 2;
6001 continue;
6002 }
6003 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6004 WidenedMask[i / 2] = M0 / 2;
6005 continue;
6006 }
6007
6008 // When zeroing, we need to spread the zeroing across both lanes to widen.
6009 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6010 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6011 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6012 WidenedMask[i / 2] = SM_SentinelZero;
6013 continue;
6014 }
6015 return false;
6016 }
6017
6018 // Finally check if the two mask values are adjacent and aligned with
6019 // a pair.
6020 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6021 WidenedMask[i / 2] = M0 / 2;
6022 continue;
6023 }
6024
6025 // Otherwise we can't safely widen the elements used in this shuffle.
6026 return false;
6027 }
6028 assert(WidenedMask.size() == Mask.size() / 2 &&
6029 "Incorrect size of mask after widening the elements!");
6030
6031 return true;
6032}
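
For intuition, here is a rough standalone version of the pairing logic above, using -1 for the undef sentinel and -2 for the zero sentinel (widenMask is a hypothetical name, not the LLVM routine, and it ignores the negative-one-element edge cases):

#include <iostream>
#include <vector>

// Pairs of adjacent mask elements are merged into one wider element when they
// are both undef, both zeroable, or form an aligned (even, even+1) pair.
static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 < 0 && M1 < 0) {                 // only sentinels in this pair
      Out[i / 2] = (M0 == -2 || M1 == -2) ? -2 : -1;
      continue;
    }
    if (M0 == -1 && M1 >= 0 && (M1 % 2) == 1) { Out[i / 2] = M1 / 2; continue; }
    if (M1 == -1 && M0 >= 0 && (M0 % 2) == 0) { Out[i / 2] = M0 / 2; continue; }
    if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) { Out[i / 2] = M0 / 2; continue; }
    return false;                           // elements do not pair up
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  std::cout << widenMask({0, 1, 6, 7}, Wide) << '\n';  // 1 -> widened to {0, 3}
  std::cout << widenMask({-1, 3, 4, 5}, Wide) << '\n'; // 1 -> widened to {1, 2}
  std::cout << widenMask({1, 0, 2, 3}, Wide) << '\n';  // 0 -> {1, 0} cannot widen
  return 0;
}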
6033
6034static bool canWidenShuffleElements(ArrayRef<int> Mask,
6035 const APInt &Zeroable,
6036 bool V2IsZero,
6037 SmallVectorImpl<int> &WidenedMask) {
6038 // Create an alternative mask with info about zeroable elements.
6039 // Here we do not set undef elements as zeroable.
6040 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
6041 if (V2IsZero) {
6042 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6043 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6044 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6045 ZeroableMask[i] = SM_SentinelZero;
6046 }
6047 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6048}
6049
6050static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6051 SmallVector<int, 32> WidenedMask;
6052 return canWidenShuffleElements(Mask, WidenedMask);
6053}
6054
6055// Attempt to narrow/widen shuffle mask until it matches the target number of
6056// elements.
6057static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6058 SmallVectorImpl<int> &ScaledMask) {
6059 unsigned NumSrcElts = Mask.size();
6060 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6061 "Illegal shuffle scale factor");
6062
6063 // Narrowing is guaranteed to work.
6064 if (NumDstElts >= NumSrcElts) {
6065 int Scale = NumDstElts / NumSrcElts;
6066 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6067 return true;
6068 }
6069
6070 // We have to repeat the widening until we reach the target size, but we can
6071 // split out the first widening as it sets up ScaledMask for us.
6072 if (canWidenShuffleElements(Mask, ScaledMask)) {
6073 while (ScaledMask.size() > NumDstElts) {
6074 SmallVector<int, 16> WidenedMask;
6075 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6076 return false;
6077 ScaledMask = std::move(WidenedMask);
6078 }
6079 return true;
6080 }
6081
6082 return false;
6083}
6084
6085/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6086bool X86::isZeroNode(SDValue Elt) {
6087 return isNullConstant(Elt) || isNullFPConstant(Elt);
6088}
6089
6090// Build a vector of constants.
6091// Use an UNDEF node if MaskElt == -1.
6092// Split 64-bit constants in the 32-bit mode.
6093static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6094 const SDLoc &dl, bool IsMask = false) {
6095
6096 SmallVector<SDValue, 32> Ops;
6097 bool Split = false;
6098
6099 MVT ConstVecVT = VT;
6100 unsigned NumElts = VT.getVectorNumElements();
6101 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6102 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6103 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6104 Split = true;
6105 }
6106
6107 MVT EltVT = ConstVecVT.getVectorElementType();
6108 for (unsigned i = 0; i < NumElts; ++i) {
6109 bool IsUndef = Values[i] < 0 && IsMask;
6110 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6111 DAG.getConstant(Values[i], dl, EltVT);
6112 Ops.push_back(OpNode);
6113 if (Split)
6114 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6115 DAG.getConstant(0, dl, EltVT));
6116 }
6117 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6118 if (Split)
6119 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6120 return ConstsNode;
6121}
6122
6123static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6124 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6125 assert(Bits.size() == Undefs.getBitWidth() &&
6126 "Unequal constant and undef arrays");
6127 SmallVector<SDValue, 32> Ops;
6128 bool Split = false;
6129
6130 MVT ConstVecVT = VT;
6131 unsigned NumElts = VT.getVectorNumElements();
6132 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6133 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6134 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6135 Split = true;
6136 }
6137
6138 MVT EltVT = ConstVecVT.getVectorElementType();
6139 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6140 if (Undefs[i]) {
6141 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6142 continue;
6143 }
6144 const APInt &V = Bits[i];
6145 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6146 if (Split) {
6147 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6148 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6149 } else if (EltVT == MVT::f32) {
6150 APFloat FV(APFloat::IEEEsingle(), V);
6151 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6152 } else if (EltVT == MVT::f64) {
6153 APFloat FV(APFloat::IEEEdouble(), V);
6154 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6155 } else {
6156 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6157 }
6158 }
6159
6160 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6161 return DAG.getBitcast(VT, ConstsNode);
6162}
6163
6164/// Returns a vector of specified type with all zero elements.
6165static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6166 SelectionDAG &DAG, const SDLoc &dl) {
6167 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6168 VT.getVectorElementType() == MVT::i1) &&
6169 "Unexpected vector type");
6170
6171 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6172 // type. This ensures they get CSE'd. But if the integer type is not
6173 // available, use a floating-point +0.0 instead.
6174 SDValue Vec;
6175 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6176 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6177 } else if (VT.isFloatingPoint()) {
6178 Vec = DAG.getConstantFP(+0.0, dl, VT);
6179 } else if (VT.getVectorElementType() == MVT::i1) {
6180 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6181 "Unexpected vector type");
6182 Vec = DAG.getConstant(0, dl, VT);
6183 } else {
6184 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6185 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6186 }
6187 return DAG.getBitcast(VT, Vec);
6188}
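// [Illustrative example, not part of the original source] With SSE2 available,
// an integer zero such as v4i64 is emitted as an all-zero v8i32 and bitcast
// back, so repeated requests CSE to the same node:
//   getZeroVector(MVT::v4i64, Subtarget, DAG, dl)
//     -> bitcast v4i64 (build_vector v8i32 zero)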
6189
6190// Helper to determine if all the extracted subvector ops come from a
6191// single source. If we allow commute they don't have to be in order (Lo/Hi).
6192static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6193 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6194 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6195 LHS.getValueType() != RHS.getValueType() ||
6196 LHS.getOperand(0) != RHS.getOperand(0))
6197 return SDValue();
6198
6199 SDValue Src = LHS.getOperand(0);
6200 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6201 return SDValue();
6202
6203 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6204 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6205 RHS.getConstantOperandAPInt(1) == NumElts) ||
6206 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6207 LHS.getConstantOperandAPInt(1) == NumElts))
6208 return Src;
6209
6210 return SDValue();
6211}
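// [Illustrative example, not part of the original source] For a v8i32 source X:
//   LHS = extract_subvector(X, 0), RHS = extract_subvector(X, 4)  -> returns X
//   LHS = extract_subvector(X, 4), RHS = extract_subvector(X, 0)  -> returns X
//     only when AllowCommute is true; otherwise SDValue() is returned.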
6212
6213static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6214 const SDLoc &dl, unsigned vectorWidth) {
6215 EVT VT = Vec.getValueType();
6216 EVT ElVT = VT.getVectorElementType();
6217 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6218 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6219 VT.getVectorNumElements() / Factor);
6220
6221 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6222 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6223 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6224
6225 // This is the index of the first element of the vectorWidth-bit chunk
6226 // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6227 IdxVal &= ~(ElemsPerChunk - 1);
6228
6229 // If the input is a buildvector just emit a smaller one.
6230 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6231 return DAG.getBuildVector(ResultVT, dl,
6232 Vec->ops().slice(IdxVal, ElemsPerChunk));
6233
6234 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6235 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6236}
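// [Illustrative example, not part of the original source] IdxVal is rounded
// down to a whole chunk, so asking for element 5 of a v8i32 with a 128-bit
// width extracts the upper half:
//   extractSubVector(v8i32 Vec, /*IdxVal*/5, DAG, dl, /*vectorWidth*/128)
//     -> ElemsPerChunk = 4, IdxVal &= ~3 -> 4
//     -> extract_subvector(Vec, 4) : v4i32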
6237
6238/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6239/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6240/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6241/// instructions or a simple subregister reference. Idx is an index in the
6242/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6243/// lowering EXTRACT_VECTOR_ELT operations easier.
6244static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6245 SelectionDAG &DAG, const SDLoc &dl) {
6246 assert((Vec.getValueType().is256BitVector() ||
6247 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6248 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6249}
6250
6251/// Generate a DAG to grab 256-bits from a 512-bit vector.
6252static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6253 SelectionDAG &DAG, const SDLoc &dl) {
6254 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6255 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6256}
6257
6258static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6259 SelectionDAG &DAG, const SDLoc &dl,
6260 unsigned vectorWidth) {
6261 assert((vectorWidth == 128 || vectorWidth == 256) &&
6262 "Unsupported vector width");
6263 // Inserting UNDEF leaves Result unchanged.
6264 if (Vec.isUndef())
6265 return Result;
6266 EVT VT = Vec.getValueType();
6267 EVT ElVT = VT.getVectorElementType();
6268 EVT ResultVT = Result.getValueType();
6269
6270 // Insert the relevant vectorWidth bits.
6271 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6272 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6273
6274 // This is the index of the first element of the vectorWidth-bit chunk
6275 // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6276 IdxVal &= ~(ElemsPerChunk - 1);
6277
6278 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6279 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6280}
6281
6282/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6283/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6284/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6285/// simple superregister reference. Idx is an index in the 128 bits
6286/// we want. It need not be aligned to a 128-bit boundary. That makes
6287/// lowering INSERT_VECTOR_ELT operations easier.
6288static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6289 SelectionDAG &DAG, const SDLoc &dl) {
6290 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!")(static_cast <bool> (Vec.getValueType().is128BitVector(
) && "Unexpected vector size!") ? void (0) : __assert_fail
("Vec.getValueType().is128BitVector() && \"Unexpected vector size!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 6290, __extension__
__PRETTY_FUNCTION__))
;
6291 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6292}
6293
6294/// Widen a vector to a larger size with the same scalar type, with the new
6295/// elements either zero or undef.
6296static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6297 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6298 const SDLoc &dl) {
6299 assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
6300 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6301 "Unsupported vector widening type");
6302 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6303 : DAG.getUNDEF(VT);
6304 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6305 DAG.getIntPtrConstant(0, dl));
6306}
6307
6308/// Widen a vector to a larger size with the same scalar type, with the new
6309/// elements either zero or undef.
6310static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6311 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6312 const SDLoc &dl, unsigned WideSizeInBits) {
6313 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6314 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6315 "Unsupported vector widening type");
6316 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6317 MVT SVT = Vec.getSimpleValueType().getScalarType();
6318 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6319 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6320}
6321
6322// Helper function to collect subvector ops that are concatenated together,
6323// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6324// The subvectors in Ops are guaranteed to be the same type.
6325static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
6326 assert(Ops.empty() && "Expected an empty ops vector");
6327
6328 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6329 Ops.append(N->op_begin(), N->op_end());
6330 return true;
6331 }
6332
6333 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6334 SDValue Src = N->getOperand(0);
6335 SDValue Sub = N->getOperand(1);
6336 const APInt &Idx = N->getConstantOperandAPInt(2);
6337 EVT VT = Src.getValueType();
6338 EVT SubVT = Sub.getValueType();
6339
6340 // TODO - Handle more general insert_subvector chains.
6341 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6342 Idx == (VT.getVectorNumElements() / 2)) {
6343 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6344 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6345 Src.getOperand(1).getValueType() == SubVT &&
6346 isNullConstant(Src.getOperand(2))) {
6347 Ops.push_back(Src.getOperand(1));
6348 Ops.push_back(Sub);
6349 return true;
6350 }
6351 // insert_subvector(x, extract_subvector(x, lo), hi)
6352 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6353 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6354 Ops.append(2, Sub);
6355 return true;
6356 }
6357 }
6358 }
6359
6360 return false;
6361}
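// [Illustrative example, not part of the original source] Patterns recognised
// above, for a v8i32 result built from v4i32 halves x and y:
//   concat_vectors(x, y)                                   -> Ops = {x, y}
//   insert_subvector(insert_subvector(undef, x, 0), y, 4)  -> Ops = {x, y}
//   insert_subvector(x, extract_subvector(x, 0), 4)
//     -> Ops = {extract_subvector(x, 0), extract_subvector(x, 0)}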
6362
6363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6364 const SDLoc &dl) {
6365 EVT VT = Op.getValueType();
6366 unsigned NumElems = VT.getVectorNumElements();
6367 unsigned SizeInBits = VT.getSizeInBits();
6368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6369 "Can't split odd sized vector");
6370
6371 // If this is a splat value (with no-undefs) then use the lower subvector,
6372 // which should be a free extraction.
6373 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6374 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6375 return std::make_pair(Lo, Lo);
6376
6377 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6378 return std::make_pair(Lo, Hi);
6379}
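// [Illustrative example, not part of the original source] Splitting a v8i32:
//   splitVector(Op, DAG, dl)
//     -> Lo = extract_subvector(Op, 0) : v4i32
//        Hi = extract_subvector(Op, 4) : v4i32
// If Op is a no-undef splat, (Lo, Lo) is returned so both halves share the one
// (free) low-subvector extraction.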
6380
6381/// Break an operation into 2 half sized ops and then concatenate the results.
6382static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6383 unsigned NumOps = Op.getNumOperands();
6384 EVT VT = Op.getValueType();
6385 SDLoc dl(Op);
6386
6387 // Extract the LHS Lo/Hi vectors
6388 SmallVector<SDValue> LoOps(NumOps, SDValue());
6389 SmallVector<SDValue> HiOps(NumOps, SDValue());
6390 for (unsigned I = 0; I != NumOps; ++I) {
6391 SDValue SrcOp = Op.getOperand(I);
6392 if (!SrcOp.getValueType().isVector()) {
6393 LoOps[I] = HiOps[I] = SrcOp;
6394 continue;
6395 }
6396 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6397 }
6398
6399 EVT LoVT, HiVT;
6400 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6401 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6402 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6403 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6404}
6405
6406/// Break a unary integer operation into 2 half sized ops and then
6407/// concatenate the result back.
6408static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6409 // Make sure we only try to split 256/512-bit types to avoid creating
6410 // narrow vectors.
6411 EVT VT = Op.getValueType();
6412 (void)VT;
6413 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6414 Op.getOperand(0).getValueType().is512BitVector()) &&
6415 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6416 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6417 VT.getVectorNumElements() &&
6418 "Unexpected VTs!");
6419 return splitVectorOp(Op, DAG);
6420}
6421
6422/// Break a binary integer operation into 2 half sized ops and then
6423/// concatenate the result back.
6424static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6425 // Assert that all the types match.
6426 EVT VT = Op.getValueType();
6427 (void)VT;
6428 assert(Op.getOperand(0).getValueType() == VT &&
6429 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6430 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6431 return splitVectorOp(Op, DAG);
6432}
6433
6434// Helper for splitting operands of an operation to legal target size and
6435// apply a function on each part.
6436// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6437// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6438// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6439// The argument Builder is a function that will be applied on each split part:
6440// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6441template <typename F>
6442SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6443 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6444 F Builder, bool CheckBWI = true) {
6445 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6446 unsigned NumSubs = 1;
6447 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6448 (!CheckBWI && Subtarget.useAVX512Regs())) {
6449 if (VT.getSizeInBits() > 512) {
6450 NumSubs = VT.getSizeInBits() / 512;
6451 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6452 }
6453 } else if (Subtarget.hasAVX2()) {
6454 if (VT.getSizeInBits() > 256) {
6455 NumSubs = VT.getSizeInBits() / 256;
6456 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6457 }
6458 } else {
6459 if (VT.getSizeInBits() > 128) {
6460 NumSubs = VT.getSizeInBits() / 128;
6461 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6462 }
6463 }
6464
6465 if (NumSubs == 1)
6466 return Builder(DAG, DL, Ops);
6467
6468 SmallVector<SDValue, 4> Subs;
6469 for (unsigned i = 0; i != NumSubs; ++i) {
6470 SmallVector<SDValue, 2> SubOps;
6471 for (SDValue Op : Ops) {
6472 EVT OpVT = Op.getValueType();
6473 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6474 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6475 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6476 }
6477 Subs.push_back(Builder(DAG, DL, SubOps));
6478 }
6479 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6480}
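// [Illustrative usage sketch, not part of the original source] A hypothetical
// caller lowering a wide add on an SSE2-only target would pass a Builder that
// emits the per-piece node; SplitOpsAndApply handles the splitting and the
// final CONCAT_VECTORS:
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue Res =
//       SplitOpsAndApply(DAG, Subtarget, DL, VT, {Op0, Op1}, AddBuilder);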
6481
6482// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6483// targets.
6484static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6485 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6486 const X86Subtarget &Subtarget) {
6487 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6488 MVT SVT = VT.getScalarType();
6489
6490 // If we have a 32/64 splatted constant, splat it to DstTy to
6491 // encourage a foldable broadcast'd operand.
6492 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6493 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6494 // AVX512 broadcasts 32/64-bit operands.
6495 // TODO: Support float once getAVX512Node is used by fp-ops.
6496 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6497 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6498 return SDValue();
6499 // If we're not widening, don't bother if we're not bitcasting.
6500 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6501 return SDValue();
6502 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6503 APInt SplatValue, SplatUndef;
6504 unsigned SplatBitSize;
6505 bool HasAnyUndefs;
6506 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6507 HasAnyUndefs, OpEltSizeInBits) &&
6508 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6509 return DAG.getConstant(SplatValue, DL, DstVT);
6510 }
6511 return SDValue();
6512 };
6513
6514 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6515
6516 MVT DstVT = VT;
6517 if (Widen)
6518 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6519
6520 // Canonicalize src operands.
6521 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6522 for (SDValue &Op : SrcOps) {
6523 MVT OpVT = Op.getSimpleValueType();
6524 // Just pass through scalar operands.
6525 if (!OpVT.isVector())
6526 continue;
6527 assert(OpVT == VT && "Vector type mismatch");
6528
6529 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6530 Op = BroadcastOp;
6531 continue;
6532 }
6533
6534 // Just widen the subvector by inserting into an undef wide vector.
6535 if (Widen)
6536 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6537 }
6538
6539 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6540
6541 // Perform the 512-bit op then extract the bottom subvector.
6542 if (Widen)
6543 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6544 return Res;
6545}
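// [Illustrative example, not part of the original source] On an AVX512F target
// without VLX, a 128-bit op is widened so the 512-bit instruction form can be
// used, then the low subvector is extracted again (Opc is a placeholder):
//   getAVX512Node(Opc, DL, MVT::v4i32, {A, B}, DAG, Subtarget)
//     -> widen A, B to v16i32 (undef upper), emit Opc : v16i32,
//        extract_subvector(result, 0) : v4i32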
6546
6547/// Insert i1-subvector to i1-vector.
6548static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6549 const X86Subtarget &Subtarget) {
6550
6551 SDLoc dl(Op);
6552 SDValue Vec = Op.getOperand(0);
6553 SDValue SubVec = Op.getOperand(1);
6554 SDValue Idx = Op.getOperand(2);
6555 unsigned IdxVal = Op.getConstantOperandVal(2);
6556
6557 // Inserting undef is a nop. We can just return the original vector.
6558 if (SubVec.isUndef())
6559 return Vec;
6560
6561 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6562 return Op;
6563
6564 MVT OpVT = Op.getSimpleValueType();
6565 unsigned NumElems = OpVT.getVectorNumElements();
6566 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6567
6568 // Extend to natively supported kshift.
6569 MVT WideOpVT = OpVT;
6570 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6571 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6572
6573 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6574 // if necessary.
6575 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6576 // May need to promote to a legal type.
6577 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6578 DAG.getConstant(0, dl, WideOpVT),
6579 SubVec, Idx);
6580 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6581 }
6582
6583 MVT SubVecVT = SubVec.getSimpleValueType();
6584 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6585 assert(IdxVal + SubVecNumElems <= NumElems &&
6586 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6587 "Unexpected index value in INSERT_SUBVECTOR");
6588
6589 SDValue Undef = DAG.getUNDEF(WideOpVT);
6590
6591 if (IdxVal == 0) {
6592 // Zero lower bits of the Vec
6593 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6594 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6595 ZeroIdx);
6596 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6597 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6598 // Merge them together, SubVec should be zero extended.
6599 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6600 DAG.getConstant(0, dl, WideOpVT),
6601 SubVec, ZeroIdx);
6602 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6603 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6604 }
6605
6606 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6607 Undef, SubVec, ZeroIdx);
6608
6609 if (Vec.isUndef()) {
6610 assert(IdxVal != 0 && "Unexpected index");
6611 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6612 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6613 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6614 }
6615
6616 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6617 assert(IdxVal != 0 && "Unexpected index");
6618 // If upper elements of Vec are known undef, then just shift into place.
6619 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6620 [](SDValue V) { return V.isUndef(); })) {
6621 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6623 } else {
6624 NumElems = WideOpVT.getVectorNumElements();
6625 unsigned ShiftLeft = NumElems - SubVecNumElems;
6626 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6628 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6629 if (ShiftRight != 0)
6630 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6631 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6632 }
6633 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6634 }
6635
6636 // Simple case when we put subvector in the upper part
6637 if (IdxVal + SubVecNumElems == NumElems) {
6638 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6639 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6640 if (SubVecNumElems * 2 == NumElems) {
6641 // Special case, use legal zero extending insert_subvector. This allows
6642 // isel to optimize when bits are known zero.
6643 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6644 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6645 DAG.getConstant(0, dl, WideOpVT),
6646 Vec, ZeroIdx);
6647 } else {
6648 // Otherwise use explicit shifts to zero the bits.
6649 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6650 Undef, Vec, ZeroIdx);
6651 NumElems = WideOpVT.getVectorNumElements();
6652 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6653 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6654 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6655 }
6656 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6657 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6658 }
6659
6660 // Inserting into the middle is more complicated.
6661
6662 NumElems = WideOpVT.getVectorNumElements();
6663
6664 // Widen the vector if needed.
6665 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6666
6667 unsigned ShiftLeft = NumElems - SubVecNumElems;
6668 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6669
6670 // Do an optimization for the most frequently used types.
6671 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6672 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6673 Mask0.flipAllBits();
6674 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6675 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6676 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6677 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6678 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6679 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6680 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6681 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6682
6683 // Reduce to original width if needed.
6684 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6685 }
6686
6687 // Clear the upper bits of the subvector and move it to its insert position.
6688 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6689 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6690 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6691 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6692
6693 // Isolate the bits below the insertion point.
6694 unsigned LowShift = NumElems - IdxVal;
6695 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6696 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6697 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6698 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6699
6700 // Isolate the bits after the last inserted bit.
6701 unsigned HighShift = IdxVal + SubVecNumElems;
6702 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6703 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6704 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6705 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6706
6707 // Now OR all 3 pieces together.
6708 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6709 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6710
6711 // Reduce to original width if needed.
6712 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6713}
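// [Illustrative example, not part of the original source] Middle insertion with
// the mask-and-shift path above, assuming AVX512DQ so WideOpVT stays v8i1:
// inserting a v2i1 SubVec at IdxVal = 2 into a v8i1 Vec (bit i = element i):
//   Mask0      = 1111 0011   -> clears Vec elements 2..3
//   ShiftLeft  = 6, ShiftRight = 4
//   SubVec << 6 >> 4 places its two bits at positions 3..2
//   result = (Vec & Mask0) | shifted SubVec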
6714
6715static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6716 const SDLoc &dl) {
6717 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6718 EVT SubVT = V1.getValueType();
6719 EVT SubSVT = SubVT.getScalarType();
6720 unsigned SubNumElts = SubVT.getVectorNumElements();
6721 unsigned SubVectorWidth = SubVT.getSizeInBits();
6722 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6723 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6724 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6725}
6726
6727/// Returns a vector of specified type with all bits set.
6728/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6729/// Then bitcast to their original type, ensuring they get CSE'd.
6730static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6731 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6732 "Expected a 128/256/512-bit vector type");
6733
6734 APInt Ones = APInt::getAllOnes(32);
6735 unsigned NumElts = VT.getSizeInBits() / 32;
6736 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6737 return DAG.getBitcast(VT, Vec);
6738}
6739
6740// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6741static unsigned getOpcode_EXTEND(unsigned Opcode) {
6742 switch (Opcode) {
6743 case ISD::ANY_EXTEND:
6744 case ISD::ANY_EXTEND_VECTOR_INREG:
6745 return ISD::ANY_EXTEND;
6746 case ISD::ZERO_EXTEND:
6747 case ISD::ZERO_EXTEND_VECTOR_INREG:
6748 return ISD::ZERO_EXTEND;
6749 case ISD::SIGN_EXTEND:
6750 case ISD::SIGN_EXTEND_VECTOR_INREG:
6751 return ISD::SIGN_EXTEND;
6752 }
6753 llvm_unreachable("Unknown opcode");
6754}
6755
6756// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6757static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6758 switch (Opcode) {
6759 case ISD::ANY_EXTEND:
6760 case ISD::ANY_EXTEND_VECTOR_INREG:
6761 return ISD::ANY_EXTEND_VECTOR_INREG;
6762 case ISD::ZERO_EXTEND:
6763 case ISD::ZERO_EXTEND_VECTOR_INREG:
6764 return ISD::ZERO_EXTEND_VECTOR_INREG;
6765 case ISD::SIGN_EXTEND:
6766 case ISD::SIGN_EXTEND_VECTOR_INREG:
6767 return ISD::SIGN_EXTEND_VECTOR_INREG;
6768 }
6769 llvm_unreachable("Unknown opcode");
6770}
6771
6772static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6773 SDValue In, SelectionDAG &DAG) {
6774 EVT InVT = In.getValueType();
6775 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6776 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6777 ISD::ZERO_EXTEND == Opcode) &&
6778 "Unknown extension opcode");
6779
6780 // For 256-bit vectors, we only need the lower (128-bit) input half.
6781 // For 512-bit vectors, we only need the lower input half or quarter.
6782 if (InVT.getSizeInBits() > 128) {
6783 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6784 "Expected VTs to be the same size!");
6785 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6786 In = extractSubVector(In, 0, DAG, DL,
6787 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6788 InVT = In.getValueType();
6789 }
6790
6791 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6792 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6793
6794 return DAG.getNode(Opcode, DL, VT, In);
6795}
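// [Illustrative example, not part of the original source] Sign-extending the
// low half of a 256-bit input to a 256-bit result:
//   getEXTEND_VECTOR_INREG(ISD::SIGN_EXTEND, DL, MVT::v8i32, In /*v16i16*/, DAG)
//     -> In is shrunk to its low v8i16 half (max(128, 256/2) = 128 bits),
//        element counts now match, so a plain ISD::SIGN_EXTEND is emitted.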
6796
6797// Match (xor X, -1) -> X.
6798// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6799// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6800static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6801 V = peekThroughBitcasts(V);
6802 if (V.getOpcode() == ISD::XOR &&
6803 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6804 return V.getOperand(0);
6805 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6806 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6807 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6808 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6810 Not, V.getOperand(1));
6811 }
6812 }
6813 SmallVector<SDValue, 2> CatOps;
6814 if (collectConcatOps(V.getNode(), CatOps)) {
6815 for (SDValue &CatOp : CatOps) {
6816 SDValue NotCat = IsNOT(CatOp, DAG);
6817 if (!NotCat) return SDValue();
6818 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6819 }
6820 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6821 }
6822 return SDValue();
6823}
6824
6825void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6826 bool Lo, bool Unary) {
6827 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6828 "Illegal vector type to unpack");
6829 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6830 int NumElts = VT.getVectorNumElements();
6831 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6832 for (int i = 0; i < NumElts; ++i) {
6833 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6834 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6835 Pos += (Unary ? 0 : NumElts * (i % 2));
6836 Pos += (Lo ? 0 : NumEltsInLane / 2);
6837 Mask.push_back(Pos);
6838 }
6839}
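// [Illustrative example, not part of the original source] For VT = v8i32 the
// masks produced above match the per-128-bit-lane unpack semantics:
//   Lo, binary (Unary=false): <0, 8, 1, 9, 4, 12, 5, 13>    (vpunpckldq)
//   Hi, binary (Unary=false): <2, 10, 3, 11, 6, 14, 7, 15>  (vpunpckhdq)
//   Lo, unary  (Unary=true):  <0, 0, 1, 1, 4, 4, 5, 5>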
6840
6841/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6842/// imposed by AVX and specific to the unary pattern. Example:
6843/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6844/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6845void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6846 bool Lo) {
6847 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6848 int NumElts = VT.getVectorNumElements();
6849 for (int i = 0; i < NumElts; ++i) {
6850 int Pos = i / 2;
6851 Pos += (Lo ? 0 : NumElts / 2);
6852 Mask.push_back(Pos);
6853 }
6854}
6855
6856// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
6857static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
6858 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
6859 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
6860 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
6861 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
6862 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
6863 int M = Mask[I];
6864 if (M < 0)
6865 continue;
6866 SDValue V = (M < NumElts) ? V1 : V2;
6867 if (V.isUndef())
6868 continue;
6869 Ops[I] = V.getOperand(M % NumElts);
6870 }
6871 return DAG.getBuildVector(VT, dl, Ops);
6872 }
6873
6874 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6875}
6876
6877/// Returns a vector_shuffle node for an unpackl operation.
6878static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6879 SDValue V1, SDValue V2) {
6880 SmallVector<int, 8> Mask;
6881 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6882 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6883}
6884
6885/// Returns a vector_shuffle node for an unpackh operation.
6886static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6887 SDValue V1, SDValue V2) {
6888 SmallVector<int, 8> Mask;
6889 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6890 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6891}
6892
6893/// Returns a node that packs the LHS + RHS nodes together at half width.
6894/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
6895/// TODO: Add subvector splitting if/when we have a need for it.
6896static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6897 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
6898 bool PackHiHalf = false) {
6899 MVT OpVT = LHS.getSimpleValueType();
6900 unsigned EltSizeInBits = VT.getScalarSizeInBits();
6901 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
6902 assert(OpVT == RHS.getSimpleValueType() &&
6903 VT.getSizeInBits() == OpVT.getSizeInBits() &&
6904 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
6905 "Unexpected PACK operand types");
6906 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
6907 "Unexpected PACK result type");
6908
6909 // Rely on vector shuffles for vXi64 -> vXi32 packing.
6910 if (EltSizeInBits == 32) {
6911 SmallVector<int> PackMask;
6912 int Offset = PackHiHalf ? 1 : 0;
6913 int NumElts = VT.getVectorNumElements();
6914 for (int I = 0; I != NumElts; I += 4) {
6915 PackMask.push_back(I + Offset);
6916 PackMask.push_back(I + Offset + 2);
6917 PackMask.push_back(I + Offset + NumElts);
6918 PackMask.push_back(I + Offset + NumElts + 2);
6919 }
6920 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
6921 DAG.getBitcast(VT, RHS), PackMask);
6922 }
6923
6924 // See if we already have sufficient leading bits for PACKSS/PACKUS.
6925 if (!PackHiHalf) {
6926 if (UsePackUS &&
6927 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
6928 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
6929 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6930
6931 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
6932 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
6933 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6934 }
6935
6936 // Fallback to sign/zero extending the requested half and pack.
6937 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
6938 if (UsePackUS) {
6939 if (PackHiHalf) {
6940 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
6941 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
6942 } else {
6943 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
6944 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
6945 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
6946 };
6947 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6948 };
6949
6950 if (!PackHiHalf) {
6951 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
6952 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
6953 }
6954 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
6955 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
6956 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6957}
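// [Illustrative example, not part of the original source] Packing v4i64 LHS/RHS
// into a v8i32 result takes the shuffle path above; with PackHiHalf=false the
// mask selects the even (low) dwords of each 128-bit lane:
//   PackMask = <0, 2, 8, 10, 4, 6, 12, 14>
// i.e. { lo32(L0), lo32(L1), lo32(R0), lo32(R1), lo32(L2), lo32(L3), ... }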
6958
6959/// Return a vector_shuffle of the specified vector and a zero or undef vector.
6960/// This produces a shuffle where the low element of V2 is swizzled into the
6961/// zero/undef vector, landing at element Idx.
6962/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6963static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6964 bool IsZero,
6965 const X86Subtarget &Subtarget,
6966 SelectionDAG &DAG) {
6967 MVT VT = V2.getSimpleValueType();
6968 SDValue V1 = IsZero
6969 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6970 int NumElems = VT.getVectorNumElements();
6971 SmallVector<int, 16> MaskVec(NumElems);
6972 for (int i = 0; i != NumElems; ++i)
6973 // If this is the insertion idx, put the low elt of V2 here.
6974 MaskVec[i] = (i == Idx) ? NumElems : i;
6975 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6976}
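Illustration (plain C++, standalone): the masks described in the comment above, computed for NumElems == 4.

// Standalone sketch of the mask getShuffleVectorZeroOrUndef builds for 4 elements.
#include <cstdio>

int main() {
  const int NumElems = 4;
  for (int Idx = 0; Idx != NumElems; ++Idx) {
    std::printf("Idx=%d:", Idx);
    for (int i = 0; i != NumElems; ++i)
      std::printf(" %d", i == Idx ? NumElems : i);  // Idx=0 -> 4 1 2 3, Idx=3 -> 0 1 2 4
    std::printf("\n");
  }
}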
6977
6978static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6979 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6980 Ptr.getOpcode() == X86ISD::WrapperRIP)
6981 Ptr = Ptr.getOperand(0);
6982
6983 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6984 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6985 return nullptr;
6986
6987 return CNode->getConstVal();
6988}
6989
6990static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6991 if (!Load || !ISD::isNormalLoad(Load))
6992 return nullptr;
6993 return getTargetConstantFromBasePtr(Load->getBasePtr());
6994}
6995
6996static const Constant *getTargetConstantFromNode(SDValue Op) {
6997 Op = peekThroughBitcasts(Op);
6998 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6999}
7000
7001const Constant *
7002X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7003 assert(LD && "Unexpected null LoadSDNode");
7004 return getTargetConstantFromNode(LD);
7005}
7006
7007// Extract raw constant bits from constant pools.
7008static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7009 APInt &UndefElts,
7010 SmallVectorImpl<APInt> &EltBits,
7011 bool AllowWholeUndefs = true,
7012 bool AllowPartialUndefs = true) {
7013 assert(EltBits.empty() && "Expected an empty EltBits vector");
7014
7015 Op = peekThroughBitcasts(Op);
7016
7017 EVT VT = Op.getValueType();
7018 unsigned SizeInBits = VT.getSizeInBits();
7019 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7020 unsigned NumElts = SizeInBits / EltSizeInBits;
7021
7022 // Bitcast a source array of element bits to the target size.
7023 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7024 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7025 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7026 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7027 "Constant bit sizes don't match");
7028
7029 // Don't split if we don't allow undef bits.
7030 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7031 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7032 return false;
7033
7034 // If we're already the right size, don't bother bitcasting.
7035 if (NumSrcElts == NumElts) {
7036 UndefElts = UndefSrcElts;
7037 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7038 return true;
7039 }
7040
7041 // Extract all the undef/constant element data and pack into single bitsets.
7042 APInt UndefBits(SizeInBits, 0);
7043 APInt MaskBits(SizeInBits, 0);
7044
7045 for (unsigned i = 0; i != NumSrcElts; ++i) {
7046 unsigned BitOffset = i * SrcEltSizeInBits;
7047 if (UndefSrcElts[i])
7048 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7049 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7050 }
7051
7052 // Split the undef/constant single bitset data into the target elements.
7053 UndefElts = APInt(NumElts, 0);
7054 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7055
7056 for (unsigned i = 0; i != NumElts; ++i) {
7057 unsigned BitOffset = i * EltSizeInBits;
7058 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7059
7060 // Only treat an element as UNDEF if all bits are UNDEF.
7061 if (UndefEltBits.isAllOnes()) {
7062 if (!AllowWholeUndefs)
7063 return false;
7064 UndefElts.setBit(i);
7065 continue;
7066 }
7067
7068 // If only some bits are UNDEF then treat them as zero (or bail if not
7069 // supported).
7070 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7071 return false;
7072
7073 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7074 }
7075 return true;
7076 };
7077
7078 // Collect constant bits and insert into mask/undef bit masks.
7079 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7080 unsigned UndefBitIndex) {
7081 if (!Cst)
7082 return false;
7083 if (isa<UndefValue>(Cst)) {
7084 Undefs.setBit(UndefBitIndex);
7085 return true;
7086 }
7087 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7088 Mask = CInt->getValue();
7089 return true;
7090 }
7091 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7092 Mask = CFP->getValueAPF().bitcastToAPInt();
7093 return true;
7094 }
7095 return false;
7096 };
7097
7098 // Handle UNDEFs.
7099 if (Op.isUndef()) {
7100 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7101 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7102 return CastBitData(UndefSrcElts, SrcEltBits);
7103 }
7104
7105 // Extract scalar constant bits.
7106 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7107 APInt UndefSrcElts = APInt::getZero(1);
7108 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7109 return CastBitData(UndefSrcElts, SrcEltBits);
7110 }
7111 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7112 APInt UndefSrcElts = APInt::getZero(1);
7113 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7114 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7115 return CastBitData(UndefSrcElts, SrcEltBits);
7116 }
7117
7118 // Extract constant bits from build vector.
7119 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7120 BitVector Undefs;
7121 SmallVector<APInt> SrcEltBits;
7122 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7123 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7124 APInt UndefSrcElts = APInt::getNullValue(SrcEltBits.size());
7125 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7126 if (Undefs[I])
7127 UndefSrcElts.setBit(I);
7128 return CastBitData(UndefSrcElts, SrcEltBits);
7129 }
7130 }
7131
7132 // Extract constant bits from constant pool vector.
7133 if (auto *Cst = getTargetConstantFromNode(Op)) {
7134 Type *CstTy = Cst->getType();
7135 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7136 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7137 return false;
7138
7139 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7140 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7141
7142 APInt UndefSrcElts(NumSrcElts, 0);
7143 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7144 for (unsigned i = 0; i != NumSrcElts; ++i)
7145 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7146 UndefSrcElts, i))
7147 return false;
7148
7149 return CastBitData(UndefSrcElts, SrcEltBits);
7150 }
7151
7152 // Extract constant bits from a broadcasted constant pool scalar.
7153 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7154 EltSizeInBits <= VT.getScalarSizeInBits()) {
7155 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7156 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7157 return false;
7158
7159 SDValue Ptr = MemIntr->getBasePtr();
7160 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7161 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7162 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7163
7164 APInt UndefSrcElts(NumSrcElts, 0);
7165 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7166 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7167 if (UndefSrcElts[0])
7168 UndefSrcElts.setBits(0, NumSrcElts);
7169 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7170 return CastBitData(UndefSrcElts, SrcEltBits);
7171 }
7172 }
7173 }
7174
7175 // Extract constant bits from a subvector broadcast.
7176 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7177 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7178 SDValue Ptr = MemIntr->getBasePtr();
7179 // The source constant may be larger than the subvector broadcast;
7180 // ensure we extract the correct subvector constants.
7181 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7182 Type *CstTy = Cst->getType();
7183 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7184 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7185 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7186 (SizeInBits % SubVecSizeInBits) != 0)
7187 return false;
7188 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7189 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7190 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7191 APInt UndefSubElts(NumSubElts, 0);
7192 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7193 APInt(CstEltSizeInBits, 0));
7194 for (unsigned i = 0; i != NumSubElts; ++i) {
7195 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7196 UndefSubElts, i))
7197 return false;
7198 for (unsigned j = 1; j != NumSubVecs; ++j)
7199 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7200 }
7201 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7202 UndefSubElts);
7203 return CastBitData(UndefSubElts, SubEltBits);
7204 }
7205 }
7206
7207 // Extract a rematerialized scalar constant insertion.
7208 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7209 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7210 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7211 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7212 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7213
7214 APInt UndefSrcElts(NumSrcElts, 0);
7215 SmallVector<APInt, 64> SrcEltBits;
7216 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7217 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7218 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7219 return CastBitData(UndefSrcElts, SrcEltBits);
7220 }
7221
7222 // Insert constant bits from base and subvector sources.
7223 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7224 // If we bitcast to larger elements we might lose track of undefs - don't
7225 // allow any, to be safe.
7226 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7227 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7228
7229 APInt UndefSrcElts, UndefSubElts;
7230 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7231 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7232 UndefSubElts, EltSubBits,
7233 AllowWholeUndefs && AllowUndefs,
7234 AllowPartialUndefs && AllowUndefs) &&
7235 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7236 UndefSrcElts, EltSrcBits,
7237 AllowWholeUndefs && AllowUndefs,
7238 AllowPartialUndefs && AllowUndefs)) {
7239 unsigned BaseIdx = Op.getConstantOperandVal(2);
7240 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7241 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7242 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7243 return CastBitData(UndefSrcElts, EltSrcBits);
7244 }
7245 }
7246
7247 // Extract constant bits from a subvector's source.
7248 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7249 // TODO - support extract_subvector through bitcasts.
7250 if (EltSizeInBits != VT.getScalarSizeInBits())
7251 return false;
7252
7253 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7254 UndefElts, EltBits, AllowWholeUndefs,
7255 AllowPartialUndefs)) {
7256 EVT SrcVT = Op.getOperand(0).getValueType();
7257 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7258 unsigned NumSubElts = VT.getVectorNumElements();
7259 unsigned BaseIdx = Op.getConstantOperandVal(1);
7260 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7261 if ((BaseIdx + NumSubElts) != NumSrcElts)
7262 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7263 if (BaseIdx != 0)
7264 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7265 return true;
7266 }
7267 }
7268
7269 // Extract constant bits from shuffle node sources.
7270 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7271 // TODO - support shuffle through bitcasts.
7272 if (EltSizeInBits != VT.getScalarSizeInBits())
7273 return false;
7274
7275 ArrayRef<int> Mask = SVN->getMask();
7276 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7277 llvm::any_of(Mask, [](int M) { return M < 0; }))
7278 return false;
7279
7280 APInt UndefElts0, UndefElts1;
7281 SmallVector<APInt, 32> EltBits0, EltBits1;
7282 if (isAnyInRange(Mask, 0, NumElts) &&
7283 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7284 UndefElts0, EltBits0, AllowWholeUndefs,
7285 AllowPartialUndefs))
7286 return false;
7287 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7288 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7289 UndefElts1, EltBits1, AllowWholeUndefs,
7290 AllowPartialUndefs))
7291 return false;
7292
7293 UndefElts = APInt::getZero(NumElts);
7294 for (int i = 0; i != (int)NumElts; ++i) {
7295 int M = Mask[i];
7296 if (M < 0) {
7297 UndefElts.setBit(i);
7298 EltBits.push_back(APInt::getZero(EltSizeInBits));
7299 } else if (M < (int)NumElts) {
7300 if (UndefElts0[M])
7301 UndefElts.setBit(i);
7302 EltBits.push_back(EltBits0[M]);
7303 } else {
7304 if (UndefElts1[M - NumElts])
7305 UndefElts.setBit(i);
7306 EltBits.push_back(EltBits1[M - NumElts]);
7307 }
7308 }
7309 return true;
7310 }
7311
7312 return false;
7313}
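Illustration (plain C++; the constants are made up): the repacking that the CastBitData lambda above performs when the requested element size differs from the source, here 2 x i64 source constants re-split into 4 x i32 elements.

// Standalone sketch: concatenate the source elements into one wide bit string
// and re-split it into EltSizeInBits-wide pieces.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t Src[2] = {0x0000000200000001ULL, 0x0000000400000003ULL};
  std::vector<uint32_t> EltBits;
  for (uint64_t S : Src) {
    EltBits.push_back(static_cast<uint32_t>(S));        // low 32 bits
    EltBits.push_back(static_cast<uint32_t>(S >> 32));  // high 32 bits
  }
  for (uint32_t E : EltBits)
    std::printf("%u ", E);  // prints: 1 2 3 4
  std::printf("\n");
}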
7314
7315namespace llvm {
7316namespace X86 {
7317bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7318 APInt UndefElts;
7319 SmallVector<APInt, 16> EltBits;
7320 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7321 UndefElts, EltBits, true,
7322 AllowPartialUndefs)) {
7323 int SplatIndex = -1;
7324 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7325 if (UndefElts[i])
7326 continue;
7327 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7328 SplatIndex = -1;
7329 break;
7330 }
7331 SplatIndex = i;
7332 }
7333 if (0 <= SplatIndex) {
7334 SplatVal = EltBits[SplatIndex];
7335 return true;
7336 }
7337 }
7338
7339 return false;
7340}
7341} // namespace X86
7342} // namespace llvm
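Illustration (plain C++; Undef/EltBits values are hypothetical): the splat scan used by X86::isConstantSplat, which skips undef lanes and requires all remaining defined lanes to carry identical bits.

// Standalone sketch of the splat detection loop above.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint64_t> EltBits = {42, 0, 42, 42};
  std::vector<bool> UndefElts   = {false, true, false, false};

  int SplatIndex = -1;
  for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
    if (UndefElts[i])
      continue;                 // undef lanes do not disqualify the splat
    if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
      SplatIndex = -1;
      break;
    }
    SplatIndex = i;
  }
  if (SplatIndex >= 0)
    std::printf("splat of %llu\n", (unsigned long long)EltBits[SplatIndex]);
  else
    std::printf("not a splat\n");
}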
7343
7344static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7345 unsigned MaskEltSizeInBits,
7346 SmallVectorImpl<uint64_t> &RawMask,
7347 APInt &UndefElts) {
7348 // Extract the raw target constant bits.
7349 SmallVector<APInt, 64> EltBits;
7350 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7351 EltBits, /* AllowWholeUndefs */ true,
7352 /* AllowPartialUndefs */ false))
7353 return false;
7354
7355 // Insert the extracted elements into the mask.
7356 for (const APInt &Elt : EltBits)
7357 RawMask.push_back(Elt.getZExtValue());
7358
7359 return true;
7360}
7361
7362/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7363/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7364/// Note: This ignores saturation, so inputs must be checked first.
7365static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7366 bool Unary, unsigned NumStages = 1) {
7367 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7368 unsigned NumElts = VT.getVectorNumElements();
7369 unsigned NumLanes = VT.getSizeInBits() / 128;
7370 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7371 unsigned Offset = Unary ? 0 : NumElts;
7372 unsigned Repetitions = 1u << (NumStages - 1);
7373 unsigned Increment = 1u << NumStages;
7374 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7375
7376 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7377 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7378 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7379 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7380 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7381 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7382 }
7383 }
7384}
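Illustration (plain C++, standalone): the mask createPackShuffleMask produces for a single-stage binary v16i8 PACK, namely the even elements of the first operand followed by the even elements of the second, per 128-bit lane.

// Standalone sketch of the single-stage, binary v16i8 pack shuffle mask.
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElts = 16, NumLanes = 1, NumEltsPerLane = 16, NumStages = 1;
  const unsigned Offset = NumElts;              // binary (not unary) pack
  const unsigned Repetitions = 1u << (NumStages - 1);
  const unsigned Increment = 1u << NumStages;

  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane);
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);
    }
  for (int M : Mask)
    std::printf("%d ", M);  // prints: 0 2 4 ... 14 16 18 ... 30
  std::printf("\n");
}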
7385
7386// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7387static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7388 APInt &DemandedLHS, APInt &DemandedRHS) {
7389 int NumLanes = VT.getSizeInBits() / 128;
7390 int NumElts = DemandedElts.getBitWidth();
7391 int NumInnerElts = NumElts / 2;
7392 int NumEltsPerLane = NumElts / NumLanes;
7393 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7394
7395 DemandedLHS = APInt::getZero(NumInnerElts);
7396 DemandedRHS = APInt::getZero(NumInnerElts);
7397
7398 // Map DemandedElts to the packed operands.
7399 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7400 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7401 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7402 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7403 if (DemandedElts[OuterIdx])
7404 DemandedLHS.setBit(InnerIdx);
7405 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7406 DemandedRHS.setBit(InnerIdx);
7407 }
7408 }
7409}
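Illustration (plain C++, 16-bit masks standing in for APInt): how getPackDemandedElts maps demanded elements of a v16i8 PACK result back to its two v8i16 operands.

// Standalone sketch: demanded result elements 3 and 11 map to element 3 of the
// LHS and element 3 of the RHS respectively.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumLanes = 1, NumElts = 16;
  const int NumInnerElts = NumElts / 2;
  const int NumEltsPerLane = NumElts / NumLanes;
  const int NumInnerEltsPerLane = NumInnerElts / NumLanes;

  uint16_t DemandedElts = (1u << 3) | (1u << 11);
  uint16_t DemandedLHS = 0, DemandedRHS = 0;
  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = Lane * NumEltsPerLane + Elt;
      int InnerIdx = Lane * NumInnerEltsPerLane + Elt;
      if (DemandedElts & (1u << OuterIdx))
        DemandedLHS |= 1u << InnerIdx;
      if (DemandedElts & (1u << (OuterIdx + NumInnerEltsPerLane)))
        DemandedRHS |= 1u << InnerIdx;
    }
  std::printf("LHS=0x%x RHS=0x%x\n", DemandedLHS, DemandedRHS);  // LHS=0x8 RHS=0x8
}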
7410
7411// Split the demanded elts of a HADD/HSUB node between its operands.
7412static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7413 APInt &DemandedLHS, APInt &DemandedRHS) {
7414 int NumLanes = VT.getSizeInBits() / 128;
7415 int NumElts = DemandedElts.getBitWidth();
7416 int NumEltsPerLane = NumElts / NumLanes;
7417 int HalfEltsPerLane = NumEltsPerLane / 2;
7418
7419 DemandedLHS = APInt::getZero(NumElts);
7420 DemandedRHS = APInt::getZero(NumElts);
7421
7422 // Map DemandedElts to the horizontal operands.
7423 for (int Idx = 0; Idx != NumElts; ++Idx) {
7424 if (!DemandedElts[Idx])
7425 continue;
7426 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7427 int LocalIdx = Idx % NumEltsPerLane;
7428 if (LocalIdx < HalfEltsPerLane) {
7429 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7430 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7431 } else {
7432 LocalIdx -= HalfEltsPerLane;
7433 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7434 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7435 }
7436 }
7437}
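Illustration (plain C++, 8-bit masks standing in for APInt): the HADD/HSUB mapping for a v4i32 node, where result element 1 reads LHS elements 2..3 and result element 2 reads RHS elements 0..1.

// Standalone sketch of getHorizDemandedElts for a single 128-bit lane.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumLanes = 1, NumElts = 4;
  const int NumEltsPerLane = NumElts / NumLanes;
  const int HalfEltsPerLane = NumEltsPerLane / 2;

  uint8_t DemandedElts = (1u << 1) | (1u << 2);  // result elements 1 and 2
  uint8_t DemandedLHS = 0, DemandedRHS = 0;
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!(DemandedElts & (1u << Idx)))
      continue;
    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    if (LocalIdx < HalfEltsPerLane) {
      DemandedLHS |= 3u << (LaneIdx + 2 * LocalIdx);   // two adjacent LHS elements
    } else {
      LocalIdx -= HalfEltsPerLane;
      DemandedRHS |= 3u << (LaneIdx + 2 * LocalIdx);   // two adjacent RHS elements
    }
  }
  std::printf("LHS=0x%x RHS=0x%x\n", DemandedLHS, DemandedRHS);  // LHS=0xc RHS=0x3
}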
7438
7439/// Calculates the shuffle mask corresponding to the target-specific opcode.
7440/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7441/// operands in \p Ops, and returns true.
7442/// Sets \p IsUnary to true if only one source is used. Note that this will set
7443/// IsUnary for shuffles which use a single input multiple times, and in those
7444/// cases it will adjust the mask to only have indices within that single input.
7445/// It is an error to call this with non-empty Mask/Ops vectors.
7446static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7447 SmallVectorImpl<SDValue> &Ops,
7448 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7449 unsigned NumElems = VT.getVectorNumElements();
7450 unsigned MaskEltSize = VT.getScalarSizeInBits();
7451 SmallVector<uint64_t, 32> RawMask;
7452 APInt RawUndefs;
7453 uint64_t ImmN;
7454
7455 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7456 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7457
7458 IsUnary = false;
7459 bool IsFakeUnary = false;
7460 switch (N->getOpcode()) {
7461 case X86ISD::BLENDI:
7462 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7463 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7464 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7465 DecodeBLENDMask(NumElems, ImmN, Mask);
7466 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7467 break;
7468 case X86ISD::SHUFP:
7469 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7470 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7471 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7472 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7473 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7474 break;
7475 case X86ISD::INSERTPS:
7476 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7477 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7478 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7479 DecodeINSERTPSMask(ImmN, Mask);
7480 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7481 break;
7482 case X86ISD::EXTRQI:
7483 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7484 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7485 isa<ConstantSDNode>(N->getOperand(2))) {
7486 int BitLen = N->getConstantOperandVal(1);
7487 int BitIdx = N->getConstantOperandVal(2);
7488 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7489 IsUnary = true;
7490 }
7491 break;
7492 case X86ISD::INSERTQI:
7493 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7494 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7495 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7496 isa<ConstantSDNode>(N->getOperand(3))) {
7497 int BitLen = N->getConstantOperandVal(2);
7498 int BitIdx = N->getConstantOperandVal(3);
7499 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7500 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7501 }
7502 break;
7503 case X86ISD::UNPCKH:
7504 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7505 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7506 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7507 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7508 break;
7509 case X86ISD::UNPCKL:
7510 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7511 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7512 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7513 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7514 break;
7515 case X86ISD::MOVHLPS:
7516 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7517 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7518 DecodeMOVHLPSMask(NumElems, Mask);
7519 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7520 break;
7521 case X86ISD::MOVLHPS:
7522 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7523 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7524 DecodeMOVLHPSMask(NumElems, Mask);
7525 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7526 break;
7527 case X86ISD::VALIGN:
7528 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7529 "Only 32-bit and 64-bit elements are supported!");
7530 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7531 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7532 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7533 DecodeVALIGNMask(NumElems, ImmN, Mask);
7534 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7535 Ops.push_back(N->getOperand(1));
7536 Ops.push_back(N->getOperand(0));
7537 break;
7538 case X86ISD::PALIGNR:
7539 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7540 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7541 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7542 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7543 DecodePALIGNRMask(NumElems, ImmN, Mask);
7544 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7545 Ops.push_back(N->getOperand(1));
7546 Ops.push_back(N->getOperand(0));
7547 break;
7548 case X86ISD::VSHLDQ:
7549 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7550 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7551 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7552 DecodePSLLDQMask(NumElems, ImmN, Mask);
7553 IsUnary = true;
7554 break;
7555 case X86ISD::VSRLDQ:
7556 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7557 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7558 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7559 DecodePSRLDQMask(NumElems, ImmN, Mask);
7560 IsUnary = true;
7561 break;
7562 case X86ISD::PSHUFD:
7563 case X86ISD::VPERMILPI:
7564 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7565 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7566 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7567 IsUnary = true;
7568 break;
7569 case X86ISD::PSHUFHW:
7570 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7571 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7572 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7573 IsUnary = true;
7574 break;
7575 case X86ISD::PSHUFLW:
7576 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7577 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7578 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7579 IsUnary = true;
7580 break;
7581 case X86ISD::VZEXT_MOVL:
7582 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7583 DecodeZeroMoveLowMask(NumElems, Mask);
7584 IsUnary = true;
7585 break;
7586 case X86ISD::VBROADCAST:
7587 // We only decode broadcasts of same-sized vectors; peeking through to
7588 // extracted subvectors is likely to cause hasOneUse issues with
7589 // SimplifyDemandedBits etc.
7590 if (N->getOperand(0).getValueType() == VT) {
7591 DecodeVectorBroadcast(NumElems, Mask);
7592 IsUnary = true;
7593 break;
7594 }
7595 return false;
7596 case X86ISD::VPERMILPV: {
7597 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7598 IsUnary = true;
7599 SDValue MaskNode = N->getOperand(1);
7600 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7601 RawUndefs)) {
7602 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7603 break;
7604 }
7605 return false;
7606 }
7607 case X86ISD::PSHUFB: {
7608 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7609 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7610 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7611 IsUnary = true;
7612 SDValue MaskNode = N->getOperand(1);
7613 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7614 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7615 break;
7616 }
7617 return false;
7618 }
7619 case X86ISD::VPERMI:
7620 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7621 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7622 DecodeVPERMMask(NumElems, ImmN, Mask);
7623 IsUnary = true;
7624 break;
7625 case X86ISD::MOVSS:
7626 case X86ISD::MOVSD:
7627 case X86ISD::MOVSH:
7628 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7629 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7630 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7631 break;
7632 case X86ISD::VPERM2X128:
7633 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7634 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7635 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7636 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7637 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7638 break;
7639 case X86ISD::SHUF128:
7640 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7641 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7642 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7643 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7644 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7645 break;
7646 case X86ISD::MOVSLDUP:
7647 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7648 DecodeMOVSLDUPMask(NumElems, Mask);
7649 IsUnary = true;
7650 break;
7651 case X86ISD::MOVSHDUP:
7652 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7653 DecodeMOVSHDUPMask(NumElems, Mask);
7654 IsUnary = true;
7655 break;
7656 case X86ISD::MOVDDUP:
7657 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7658 DecodeMOVDDUPMask(NumElems, Mask);
7659 IsUnary = true;
7660 break;
7661 case X86ISD::VPERMIL2: {
7662 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7663 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7664 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7665 SDValue MaskNode = N->getOperand(2);
7666 SDValue CtrlNode = N->getOperand(3);
7667 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7668 unsigned CtrlImm = CtrlOp->getZExtValue();
7669 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7670 RawUndefs)) {
7671 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7672 Mask);
7673 break;
7674 }
7675 }
7676 return false;
7677 }
7678 case X86ISD::VPPERM: {
7679 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7680 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7681 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7682 SDValue MaskNode = N->getOperand(2);
7683 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7684 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7685 break;
7686 }
7687 return false;
7688 }
7689 case X86ISD::VPERMV: {
7690 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7691 IsUnary = true;
7692 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7693 Ops.push_back(N->getOperand(1));
7694 SDValue MaskNode = N->getOperand(0);
7695 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7696 RawUndefs)) {
7697 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7698 break;
7699 }
7700 return false;
7701 }
7702 case X86ISD::VPERMV3: {
7703 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7704 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7705 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7706 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7707 Ops.push_back(N->getOperand(0));
7708 Ops.push_back(N->getOperand(2));
7709 SDValue MaskNode = N->getOperand(1);
7710 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7711 RawUndefs)) {
7712 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7713 break;
7714 }
7715 return false;
7716 }
7717 default: llvm_unreachable("unknown target shuffle node");
7718 }
7719
7720 // Empty mask indicates the decode failed.
7721 if (Mask.empty())
7722 return false;
7723
7724 // Check if we're getting a shuffle mask with zero'd elements.
7725 if (!AllowSentinelZero && isAnyZero(Mask))
7726 return false;
7727
7728 // If we have a fake unary shuffle, the shuffle mask is spread across two
7729 // inputs that are actually the same node. Re-map the mask to always point
7730 // into the first input.
7731 if (IsFakeUnary)
7732 for (int &M : Mask)
7733 if (M >= (int)Mask.size())
7734 M -= Mask.size();
7735
7736 // If we didn't already add operands in the opcode-specific code, default to
7737 // adding 1 or 2 operands starting at 0.
7738 if (Ops.empty()) {
7739 Ops.push_back(N->getOperand(0));
7740 if (!IsUnary || IsFakeUnary)
7741 Ops.push_back(N->getOperand(1));
7742 }
7743
7744 return true;
7745}
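Illustration (plain C++, standalone): the IsFakeUnary remap performed at the end of getTargetShuffleMask, folding indices that point at the duplicated second operand back into the first.

// Standalone sketch: an UNPCKL-style mask over two identical operands.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {0, 4, 1, 5};       // 0..3 = first input, 4..7 = second
  const bool IsFakeUnary = true;              // operand 0 == operand 1
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= (int)Mask.size();
  for (int M : Mask)
    std::printf("%d ", M);                    // prints: 0 0 1 1
  std::printf("\n");
}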
7746
7747 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
7748static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7749 SmallVectorImpl<SDValue> &Ops,
7750 SmallVectorImpl<int> &Mask) {
7751 bool IsUnary;
7752 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7753}
7754
7755/// Compute whether each element of a shuffle is zeroable.
7756///
7757/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7758/// Either it is an undef element in the shuffle mask, the element of the input
7759/// referenced is undef, or the element of the input referenced is known to be
7760/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7761/// as many lanes with this technique as possible to simplify the remaining
7762/// shuffle.
7763static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7764 SDValue V1, SDValue V2,
7765 APInt &KnownUndef, APInt &KnownZero) {
7766 int Size = Mask.size();
7767 KnownUndef = KnownZero = APInt::getZero(Size);
7768
7769 V1 = peekThroughBitcasts(V1);
7770 V2 = peekThroughBitcasts(V2);
7771
7772 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7773 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7774
7775 int VectorSizeInBits = V1.getValueSizeInBits();
7776 int ScalarSizeInBits = VectorSizeInBits / Size;
7777 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7778
7779 for (int i = 0; i < Size; ++i) {
7780 int M = Mask[i];
7781 // Handle the easy cases.
7782 if (M < 0) {
7783 KnownUndef.setBit(i);
7784 continue;
7785 }
7786 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7787 KnownZero.setBit(i);
7788 continue;
7789 }
7790
7791 // Determine shuffle input and normalize the mask.
7792 SDValue V = M < Size ? V1 : V2;
7793 M %= Size;
7794
7795 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7796 if (V.getOpcode() != ISD::BUILD_VECTOR)
7797 continue;
7798
7799 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
7800 // the (larger) source element must be UNDEF/ZERO.
7801 if ((Size % V.getNumOperands()) == 0) {
7802 int Scale = Size / V->getNumOperands();
7803 SDValue Op = V.getOperand(M / Scale);
7804 if (Op.isUndef())
7805 KnownUndef.setBit(i);
7806 if (X86::isZeroNode(Op))
7807 KnownZero.setBit(i);
7808 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7809 APInt Val = Cst->getAPIntValue();
7810 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7811 if (Val == 0)
7812 KnownZero.setBit(i);
7813 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7814 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7815 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7816 if (Val == 0)
7817 KnownZero.setBit(i);
7818 }
7819 continue;
7820 }
7821
7822 // If the BUILD_VECTOR has more elements, then all the (smaller) source
7823 // elements must be UNDEF or ZERO.
7824 if ((V.getNumOperands() % Size) == 0) {
7825 int Scale = V->getNumOperands() / Size;
7826 bool AllUndef = true;
7827 bool AllZero = true;
7828 for (int j = 0; j < Scale; ++j) {
7829 SDValue Op = V.getOperand((M * Scale) + j);
7830 AllUndef &= Op.isUndef();
7831 AllZero &= X86::isZeroNode(Op);
7832 }
7833 if (AllUndef)
7834 KnownUndef.setBit(i);
7835 if (AllZero)
7836 KnownZero.setBit(i);
7837 continue;
7838 }
7839 }
7840}
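Illustration (plain C++; the mask is made up): the easy cases of computeZeroableShuffleElements, where undef mask entries become KnownUndef and entries reading an all-zeros operand become KnownZero.

// Standalone sketch using 8-bit masks in place of APInt.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;
  std::vector<int> Mask = {-1, 0, 5, 2};  // -1 = undef, 0..3 = V1, 4..7 = V2
  const bool V1IsZero = false, V2IsZero = true;

  uint8_t KnownUndef = 0, KnownZero = 0;
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      KnownUndef |= 1u << i;
    else if ((M < Size && V1IsZero) || (M >= Size && V2IsZero))
      KnownZero |= 1u << i;
  }
  std::printf("Undef=0x%x Zero=0x%x\n", KnownUndef, KnownZero);  // Undef=0x1 Zero=0x4
}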
7841
7842/// Decode a target shuffle mask and inputs and see if any values are
7843/// known to be undef or zero from their inputs.
7844/// Returns true if the target shuffle mask was decoded.
7845/// FIXME: Merge this with computeZeroableShuffleElements?
7846static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7847 SmallVectorImpl<SDValue> &Ops,
7848 APInt &KnownUndef, APInt &KnownZero) {
7849 bool IsUnary;
7850 if (!isTargetShuffle(N.getOpcode()))
7851 return false;
7852
7853 MVT VT = N.getSimpleValueType();
7854 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7855 return false;
7856
7857 int Size = Mask.size();
7858 SDValue V1 = Ops[0];
7859 SDValue V2 = IsUnary ? V1 : Ops[1];
7860 KnownUndef = KnownZero = APInt::getZero(Size);
7861
7862 V1 = peekThroughBitcasts(V1);
7863 V2 = peekThroughBitcasts(V2);
7864
7865 assert((VT.getSizeInBits() % Size) == 0 &&
7866 "Illegal split of shuffle value type");
7867 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7868
7869 // Extract known constant input data.
7870 APInt UndefSrcElts[2];
7871 SmallVector<APInt, 32> SrcEltBits[2];
7872 bool IsSrcConstant[2] = {
7873 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7874 SrcEltBits[0], true, false),
7875 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7876 SrcEltBits[1], true, false)};
7877
7878 for (int i = 0; i < Size; ++i) {
7879 int M = Mask[i];
7880
7881 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7882 if (M < 0) {
7883 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7884 if (SM_SentinelUndef == M)
7885 KnownUndef.setBit(i);
7886 if (SM_SentinelZero == M)
7887 KnownZero.setBit(i);
7888 continue;
7889 }
7890
7891 // Determine shuffle input and normalize the mask.
7892 unsigned SrcIdx = M / Size;
7893 SDValue V = M < Size ? V1 : V2;
7894 M %= Size;
7895
7896 // We are referencing an UNDEF input.
7897 if (V.isUndef()) {
7898 KnownUndef.setBit(i);
7899 continue;
7900 }
7901
7902 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7903 // TODO: We currently only set UNDEF for integer types - floats use the same
7904 // registers as vectors and many of the scalar folded loads rely on the
7905 // SCALAR_TO_VECTOR pattern.
7906 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7907 (Size % V.getValueType().getVectorNumElements()) == 0) {
7908 int Scale = Size / V.getValueType().getVectorNumElements();
7909 int Idx = M / Scale;
7910 if (Idx != 0 && !VT.isFloatingPoint())
7911 KnownUndef.setBit(i);
7912 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7913 KnownZero.setBit(i);
7914 continue;
7915 }
7916
7917 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7918 // base vectors.
7919 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7920 SDValue Vec = V.getOperand(0);
7921 int NumVecElts = Vec.getValueType().getVectorNumElements();
7922 if (Vec.isUndef() && Size == NumVecElts) {
7923 int Idx = V.getConstantOperandVal(2);
7924 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7925 if (M < Idx || (Idx + NumSubElts) <= M)
7926 KnownUndef.setBit(i);
7927 }
7928 continue;
7929 }
7930
7931 // Attempt to extract from the source's constant bits.
7932 if (IsSrcConstant[SrcIdx]) {
7933 if (UndefSrcElts[SrcIdx][M])
7934 KnownUndef.setBit(i);
7935 else if (SrcEltBits[SrcIdx][M] == 0)
7936 KnownZero.setBit(i);
7937 }
7938 }
7939
7940 assert(VT.getVectorNumElements() == (unsigned)Size &&
7941 "Different mask size from vector size!");
7942 return true;
7943}
7944
7945// Replace target shuffle mask elements with known undef/zero sentinels.
7946static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7947 const APInt &KnownUndef,
7948 const APInt &KnownZero,
7949 bool ResolveKnownZeros= true) {
7950 unsigned NumElts = Mask.size();
7951 assert(KnownUndef.getBitWidth() == NumElts &&
7952 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7953
7954 for (unsigned i = 0; i != NumElts; ++i) {
7955 if (KnownUndef[i])
7956 Mask[i] = SM_SentinelUndef;
7957 else if (ResolveKnownZeros && KnownZero[i])
7958 Mask[i] = SM_SentinelZero;
7959 }
7960}
7961
7962// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7963static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7964 APInt &KnownUndef,
7965 APInt &KnownZero) {
7966 unsigned NumElts = Mask.size();
7967 KnownUndef = KnownZero = APInt::getZero(NumElts);
7968
7969 for (unsigned i = 0; i != NumElts; ++i) {
7970 int M = Mask[i];
7971 if (SM_SentinelUndef == M)
7972 KnownUndef.setBit(i);
7973 if (SM_SentinelZero == M)
7974 KnownZero.setBit(i);
7975 }
7976}
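
The two helpers above are inverses of each other: one folds known-undef/known-zero bits back into the mask as sentinel values, the other extracts the sentinels into bitmasks. As a rough illustration only, here is a standalone sketch (not part of the analyzed file) using plain std::vector and a 64-bit bitmask in place of SmallVectorImpl/APInt; the sentinel values -1 and -2 are assumed to correspond to SM_SentinelUndef and SM_SentinelZero.

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr int SentinelUndef = -1; // assumed stand-in for SM_SentinelUndef
constexpr int SentinelZero  = -2; // assumed stand-in for SM_SentinelZero

// Mirrors resolveZeroablesFromTargetShuffle: sentinels -> known bitmasks.
void sentinelsToBitmasks(const std::vector<int> &Mask, uint64_t &KnownUndef,
                         uint64_t &KnownZero) {
  KnownUndef = KnownZero = 0;
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SentinelUndef) KnownUndef |= (1ull << i);
    if (Mask[i] == SentinelZero)  KnownZero  |= (1ull << i);
  }
}

// Mirrors resolveTargetShuffleFromZeroables: known bitmasks -> sentinels.
void bitmasksToSentinels(std::vector<int> &Mask, uint64_t KnownUndef,
                         uint64_t KnownZero, bool ResolveKnownZeros = true) {
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (KnownUndef & (1ull << i))
      Mask[i] = SentinelUndef;
    else if (ResolveKnownZeros && (KnownZero & (1ull << i)))
      Mask[i] = SentinelZero;
  }
}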
7977
7978// Forward declaration (for getFauxShuffleMask recursive check).
7979static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7980 SmallVectorImpl<int> &Mask,
7981 const SelectionDAG &DAG, unsigned Depth,
7982 bool ResolveKnownElts);
7983
7984// Attempt to decode ops that could be represented as a shuffle mask.
7985// The decoded shuffle mask may contain a different number of elements to the
7986// destination value type.
7987// TODO: Merge into getTargetShuffleInputs()
7988static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7989 SmallVectorImpl<int> &Mask,
7990 SmallVectorImpl<SDValue> &Ops,
7991 const SelectionDAG &DAG, unsigned Depth,
7992 bool ResolveKnownElts) {
7993 Mask.clear();
7994 Ops.clear();
7995
7996 MVT VT = N.getSimpleValueType();
7997 unsigned NumElts = VT.getVectorNumElements();
7998 unsigned NumSizeInBits = VT.getSizeInBits();
7999 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8000 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8001 return false;
8002 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8003 unsigned NumSizeInBytes = NumSizeInBits / 8;
8004 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8005
8006 unsigned Opcode = N.getOpcode();
8007 switch (Opcode) {
8008 case ISD::VECTOR_SHUFFLE: {
8009 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8010 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8011 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8012 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8013 Ops.push_back(N.getOperand(0));
8014 Ops.push_back(N.getOperand(1));
8015 return true;
8016 }
8017 return false;
8018 }
8019 case ISD::AND:
8020 case X86ISD::ANDNP: {
8021 // Attempt to decode as a per-byte mask.
8022 APInt UndefElts;
8023 SmallVector<APInt, 32> EltBits;
8024 SDValue N0 = N.getOperand(0);
8025 SDValue N1 = N.getOperand(1);
8026 bool IsAndN = (X86ISD::ANDNP == Opcode);
8027 uint64_t ZeroMask = IsAndN ? 255 : 0;
8028 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8029 return false;
8030 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8031 if (UndefElts[i]) {
8032 Mask.push_back(SM_SentinelUndef);
8033 continue;
8034 }
8035 const APInt &ByteBits = EltBits[i];
8036 if (ByteBits != 0 && ByteBits != 255)
8037 return false;
8038 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8039 }
8040 Ops.push_back(IsAndN ? N1 : N0);
8041 return true;
8042 }
8043 case ISD::OR: {
8044 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8045 // is a valid shuffle index.
8046 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8047 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8048 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8049 return false;
8050 SmallVector<int, 64> SrcMask0, SrcMask1;
8051 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8052 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
8053 true) ||
8054 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
8055 true))
8056 return false;
8057
8058 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8059 SmallVector<int, 64> Mask0, Mask1;
8060 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8061 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8062 for (int i = 0; i != (int)MaskSize; ++i) {
8063 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8064 // loops converting between OR and BLEND shuffles due to
8065 // canWidenShuffleElements merging away undef elements, meaning we
8066 // fail to recognise the OR as the undef element isn't known zero.
8067 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8068 Mask.push_back(SM_SentinelZero);
8069 else if (Mask1[i] == SM_SentinelZero)
8070 Mask.push_back(i);
8071 else if (Mask0[i] == SM_SentinelZero)
8072 Mask.push_back(i + MaskSize);
8073 else
8074 return false;
8075 }
8076 Ops.push_back(N0);
8077 Ops.push_back(N1);
8078 return true;
8079 }
8080 case ISD::INSERT_SUBVECTOR: {
8081 SDValue Src = N.getOperand(0);
8082 SDValue Sub = N.getOperand(1);
8083 EVT SubVT = Sub.getValueType();
8084 unsigned NumSubElts = SubVT.getVectorNumElements();
8085 if (!N->isOnlyUserOf(Sub.getNode()))
8086 return false;
8087 uint64_t InsertIdx = N.getConstantOperandVal(2);
8088 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8089 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8090 Sub.getOperand(0).getValueType() == VT) {
8091 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8092 for (int i = 0; i != (int)NumElts; ++i)
8093 Mask.push_back(i);
8094 for (int i = 0; i != (int)NumSubElts; ++i)
8095 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8096 Ops.push_back(Src);
8097 Ops.push_back(Sub.getOperand(0));
8098 return true;
8099 }
8100 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8101 SmallVector<int, 64> SubMask;
8102 SmallVector<SDValue, 2> SubInputs;
8103 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
8104 SubMask, DAG, Depth + 1, ResolveKnownElts))
8105 return false;
8106
8107 // Subvector shuffle inputs must not be larger than the subvector.
8108 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8109 return SubVT.getFixedSizeInBits() <
8110 SubInput.getValueSizeInBits().getFixedSize();
8111 }))
8112 return false;
8113
8114 if (SubMask.size() != NumSubElts) {
8115 assert(((SubMask.size() % NumSubElts) == 0 ||
8116 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8117 if ((NumSubElts % SubMask.size()) == 0) {
8118 int Scale = NumSubElts / SubMask.size();
8119 SmallVector<int,64> ScaledSubMask;
8120 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8121 SubMask = ScaledSubMask;
8122 } else {
8123 int Scale = SubMask.size() / NumSubElts;
8124 NumSubElts = SubMask.size();
8125 NumElts *= Scale;
8126 InsertIdx *= Scale;
8127 }
8128 }
8129 Ops.push_back(Src);
8130 Ops.append(SubInputs.begin(), SubInputs.end());
8131 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8132 Mask.append(NumElts, SM_SentinelZero);
8133 else
8134 for (int i = 0; i != (int)NumElts; ++i)
8135 Mask.push_back(i);
8136 for (int i = 0; i != (int)NumSubElts; ++i) {
8137 int M = SubMask[i];
8138 if (0 <= M) {
8139 int InputIdx = M / NumSubElts;
8140 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8141 }
8142 Mask[i + InsertIdx] = M;
8143 }
8144 return true;
8145 }
8146 case X86ISD::PINSRB:
8147 case X86ISD::PINSRW:
8148 case ISD::SCALAR_TO_VECTOR:
8149 case ISD::INSERT_VECTOR_ELT: {
8150 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8151 // vector, for matching src/dst vector types.
8152 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8153
8154 unsigned DstIdx = 0;
8155 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8156 // Check we have an in-range constant insertion index.
8157 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8158 N.getConstantOperandAPInt(2).uge(NumElts))
8159 return false;
8160 DstIdx = N.getConstantOperandVal(2);
8161
8162 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8163 if (X86::isZeroNode(Scl)) {
8164 Ops.push_back(N.getOperand(0));
8165 for (unsigned i = 0; i != NumElts; ++i)
8166 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8167 return true;
8168 }
8169 }
8170
8171 // Peek through trunc/aext/zext.
8172 // TODO: aext shouldn't require SM_SentinelZero padding.
8173 // TODO: handle shift of scalars.
8174 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8175 while (Scl.getOpcode() == ISD::TRUNCATE ||
8176 Scl.getOpcode() == ISD::ANY_EXTEND ||
8177 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8178 Scl = Scl.getOperand(0);
8179 MinBitsPerElt =
8180 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8181 }
8182 if ((MinBitsPerElt % 8) != 0)
8183 return false;
8184
8185 // Attempt to find the source vector the scalar was extracted from.
8186 SDValue SrcExtract;
8187 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8188 Scl.getOpcode() == X86ISD::PEXTRW ||
8189 Scl.getOpcode() == X86ISD::PEXTRB) &&
8190 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8191 SrcExtract = Scl;
8192 }
8193 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8194 return false;
8195
8196 SDValue SrcVec = SrcExtract.getOperand(0);
8197 EVT SrcVT = SrcVec.getValueType();
8198 if (!SrcVT.getScalarType().isByteSized())
8199 return false;
8200 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8201 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8202 unsigned DstByte = DstIdx * NumBytesPerElt;
8203 MinBitsPerElt =
8204 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8205
8206 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8207 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8208 Ops.push_back(SrcVec);
8209 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8210 } else {
8211 Ops.push_back(SrcVec);
8212 Ops.push_back(N.getOperand(0));
8213 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8214 Mask.push_back(NumSizeInBytes + i);
8215 }
8216
8217 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8218 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8219 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8220 Mask[DstByte + i] = SrcByte + i;
8221 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8222 Mask[DstByte + i] = SM_SentinelZero;
8223 return true;
8224 }
8225 case X86ISD::PACKSS:
8226 case X86ISD::PACKUS: {
8227 SDValue N0 = N.getOperand(0);
8228 SDValue N1 = N.getOperand(1);
8229 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8230 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8231 "Unexpected input value type");
8232
8233 APInt EltsLHS, EltsRHS;
8234 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8235
8236 // If we know input saturation won't happen (or we don't care for particular
8237 // lanes), we can treat this as a truncation shuffle.
8238 bool Offset0 = false, Offset1 = false;
8239 if (Opcode == X86ISD::PACKSS) {
8240 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8241 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8242 (!(N1.isUndef() || EltsRHS.isZero()) &&
8243 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8244 return false;
8245 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8246 // PACKSS then it was likely being used for sign-extension for a
8247 // truncation, so just peek through and adjust the mask accordingly.
8248 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8249 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8250 Offset0 = true;
8251 N0 = N0.getOperand(0);
8252 }
8253 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8254 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8255 Offset1 = true;
8256 N1 = N1.getOperand(0);
8257 }
8258 } else {
8259 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8260 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8261 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8262 (!(N1.isUndef() || EltsRHS.isZero()) &&
8263 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8264 return false;
8265 }
8266
8267 bool IsUnary = (N0 == N1);
8268
8269 Ops.push_back(N0);
8270 if (!IsUnary)
8271 Ops.push_back(N1);
8272
8273 createPackShuffleMask(VT, Mask, IsUnary);
8274
8275 if (Offset0 || Offset1) {
8276 for (int &M : Mask)
8277 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8278 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8279 ++M;
8280 }
8281 return true;
8282 }
8283 case X86ISD::VTRUNC: {
8284 SDValue Src = N.getOperand(0);
8285 EVT SrcVT = Src.getValueType();
8286 // Truncated source must be a simple vector.
8287 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8288 (SrcVT.getScalarSizeInBits() % 8) != 0)
8289 return false;
8290 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8291 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8292 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8293 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8294 for (unsigned i = 0; i != NumSrcElts; ++i)
8295 Mask.push_back(i * Scale);
8296 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8297 Ops.push_back(Src);
8298 return true;
8299 }
8300 case X86ISD::VSHLI:
8301 case X86ISD::VSRLI: {
8302 uint64_t ShiftVal = N.getConstantOperandVal(1);
8303 // Out of range bit shifts are guaranteed to be zero.
8304 if (NumBitsPerElt <= ShiftVal) {
8305 Mask.append(NumElts, SM_SentinelZero);
8306 return true;
8307 }
8308
8309 // We can only decode 'whole byte' bit shifts as shuffles.
8310 if ((ShiftVal % 8) != 0)
8311 break;
8312
8313 uint64_t ByteShift = ShiftVal / 8;
8314 Ops.push_back(N.getOperand(0));
8315
8316 // Clear mask to all zeros and insert the shifted byte indices.
8317 Mask.append(NumSizeInBytes, SM_SentinelZero);
8318
8319 if (X86ISD::VSHLI == Opcode) {
8320 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8321 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8322 Mask[i + j] = i + j - ByteShift;
8323 } else {
8324 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8325 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8326 Mask[i + j - ByteShift] = i + j;
8327 }
8328 return true;
8329 }
8330 case X86ISD::VROTLI:
8331 case X86ISD::VROTRI: {
8332 // We can only decode 'whole byte' bit rotates as shuffles.
8333 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8334 if ((RotateVal % 8) != 0)
8335 return false;
8336 Ops.push_back(N.getOperand(0));
8337 int Offset = RotateVal / 8;
8338 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8339 for (int i = 0; i != (int)NumElts; ++i) {
8340 int BaseIdx = i * NumBytesPerElt;
8341 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8342 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8343 }
8344 }
8345 return true;
8346 }
8347 case X86ISD::VBROADCAST: {
8348 SDValue Src = N.getOperand(0);
8349 if (!Src.getSimpleValueType().isVector()) {
8350 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8351 !isNullConstant(Src.getOperand(1)) ||
8352 Src.getOperand(0).getValueType().getScalarType() !=
8353 VT.getScalarType())
8354 return false;
8355 Src = Src.getOperand(0);
8356 }
8357 Ops.push_back(Src);
8358 Mask.append(NumElts, 0);
8359 return true;
8360 }
8361 case ISD::ZERO_EXTEND:
8362 case ISD::ANY_EXTEND:
8363 case ISD::ZERO_EXTEND_VECTOR_INREG:
8364 case ISD::ANY_EXTEND_VECTOR_INREG: {
8365 SDValue Src = N.getOperand(0);
8366 EVT SrcVT = Src.getValueType();
8367
8368 // Extended source must be a simple vector.
8369 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8370 (SrcVT.getScalarSizeInBits() % 8) != 0)
8371 return false;
8372
8373 bool IsAnyExtend =
8374 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8375 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8376 IsAnyExtend, Mask);
8377 Ops.push_back(Src);
8378 return true;
8379 }
8380 }
8381
8382 return false;
8383}
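
getFauxShuffleMask decodes each supported node kind into a byte- or element-level mask; the X86ISD::VSHLI/VSRLI case above is the easiest one to picture, since shifted-in bytes simply become zero sentinels within each element lane. As a rough illustration only, here is a standalone sketch (not part of the analyzed file) that builds the same per-lane byte mask with plain std::vector; -2 stands in for SM_SentinelZero (assumed).

#include <vector>

// Build the byte-level mask for a whole-byte per-element shift, mirroring the
// VSHLI/VSRLI loops above. ByteShift is the shift amount in bytes.
std::vector<int> byteShiftMask(unsigned NumSizeInBytes, unsigned NumBytesPerElt,
                               unsigned ByteShift, bool IsLeftShift) {
  std::vector<int> Mask(NumSizeInBytes, -2); // -2 ~ SM_SentinelZero (assumed)
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) {
      if (IsLeftShift)
        Mask[i + j] = i + j - ByteShift;   // bytes move up within the lane
      else
        Mask[i + j - ByteShift] = i + j;   // bytes move down within the lane
    }
  return Mask;
}

// e.g. byteShiftMask(16, 4, 1, true) leaves one zeroed byte at the bottom of
// each 4-byte lane: { -2,0,1,2, -2,4,5,6, -2,8,9,10, -2,12,13,14 }.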
8384
8385/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8386static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8387 SmallVectorImpl<int> &Mask) {
8388 int MaskWidth = Mask.size();
8389 SmallVector<SDValue, 16> UsedInputs;
8390 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8391 int lo = UsedInputs.size() * MaskWidth;
8392 int hi = lo + MaskWidth;
8393
8394 // Strip UNDEF input usage.
8395 if (Inputs[i].isUndef())
8396 for (int &M : Mask)
8397 if ((lo <= M) && (M < hi))
8398 M = SM_SentinelUndef;
8399
8400 // Check for unused inputs.
8401 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8402 for (int &M : Mask)
8403 if (lo <= M)
8404 M -= MaskWidth;
8405 continue;
8406 }
8407
8408 // Check for repeated inputs.
8409 bool IsRepeat = false;
8410 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8411 if (UsedInputs[j] != Inputs[i])
8412 continue;
8413 for (int &M : Mask)
8414 if (lo <= M)
8415 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8416 IsRepeat = true;
8417 break;
8418 }
8419 if (IsRepeat)
8420 continue;
8421
8422 UsedInputs.push_back(Inputs[i]);
8423 }
8424 Inputs = UsedInputs;
8425}
8426
8427/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8428/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8429/// Returns true if the target shuffle mask was decoded.
8430static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8431 SmallVectorImpl<SDValue> &Inputs,
8432 SmallVectorImpl<int> &Mask,
8433 APInt &KnownUndef, APInt &KnownZero,
8434 const SelectionDAG &DAG, unsigned Depth,
8435 bool ResolveKnownElts) {
8436 if (Depth >= SelectionDAG::MaxRecursionDepth)
8437 return false; // Limit search depth.
8438
8439 EVT VT = Op.getValueType();
8440 if (!VT.isSimple() || !VT.isVector())
8441 return false;
8442
8443 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8444 if (ResolveKnownElts)
8445 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8446 return true;
8447 }
8448 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8449 ResolveKnownElts)) {
8450 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8451 return true;
8452 }
8453 return false;
8454}
8455
8456static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8457 SmallVectorImpl<int> &Mask,
8458 const SelectionDAG &DAG, unsigned Depth = 0,
8459 bool ResolveKnownElts = true) {
8460 EVT VT = Op.getValueType();
8461 if (!VT.isSimple() || !VT.isVector())
8462 return false;
8463
8464 APInt KnownUndef, KnownZero;
8465 unsigned NumElts = Op.getValueType().getVectorNumElements();
8466 APInt DemandedElts = APInt::getAllOnes(NumElts);
8467 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8468 KnownZero, DAG, Depth, ResolveKnownElts);
8469}
8470
8471// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8472static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8473 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8474 SelectionDAG &DAG) {
8475 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8476 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8477 "Unknown broadcast load type");
8478
8479 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8480 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8481 return SDValue();
8482
8483 SDValue Ptr =
8484 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8485 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8486 SDValue Ops[] = {Mem->getChain(), Ptr};
8487 SDValue BcstLd = DAG.getMemIntrinsicNode(
8488 Opcode, DL, Tys, Ops, MemVT,
8489 DAG.getMachineFunction().getMachineMemOperand(
8490 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8491 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8492 return BcstLd;
8493}
8494
8495/// Returns the scalar element that will make up the i'th
8496/// element of the result of the vector shuffle.
8497static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8498 SelectionDAG &DAG, unsigned Depth) {
8499 if (Depth >= SelectionDAG::MaxRecursionDepth)
8500 return SDValue(); // Limit search depth.
8501
8502 EVT VT = Op.getValueType();
8503 unsigned Opcode = Op.getOpcode();
8504 unsigned NumElems = VT.getVectorNumElements();
8505
8506 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8507 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8508 int Elt = SV->getMaskElt(Index);
8509
8510 if (Elt < 0)
8511 return DAG.getUNDEF(VT.getVectorElementType());
8512
8513 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8514 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8515 }
8516
8517 // Recurse into target specific vector shuffles to find scalars.
8518 if (isTargetShuffle(Opcode)) {
8519 MVT ShufVT = VT.getSimpleVT();
8520 MVT ShufSVT = ShufVT.getVectorElementType();
8521 int NumElems = (int)ShufVT.getVectorNumElements();
8522 SmallVector<int, 16> ShuffleMask;
8523 SmallVector<SDValue, 16> ShuffleOps;
8524 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8525 ShuffleMask))
8526 return SDValue();
8527
8528 int Elt = ShuffleMask[Index];
8529 if (Elt == SM_SentinelZero)
8530 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8531 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8532 if (Elt == SM_SentinelUndef)
8533 return DAG.getUNDEF(ShufSVT);
8534
8535 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8536 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8537 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8538 }
8539
8540 // Recurse into insert_subvector base/sub vector to find scalars.
8541 if (Opcode == ISD::INSERT_SUBVECTOR) {
8542 SDValue Vec = Op.getOperand(0);
8543 SDValue Sub = Op.getOperand(1);
8544 uint64_t SubIdx = Op.getConstantOperandVal(2);
8545 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8546
8547 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8548 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8549 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8550 }
8551
8552 // Recurse into concat_vectors sub vector to find scalars.
8553 if (Opcode == ISD::CONCAT_VECTORS) {
8554 EVT SubVT = Op.getOperand(0).getValueType();
8555 unsigned NumSubElts = SubVT.getVectorNumElements();
8556 uint64_t SubIdx = Index / NumSubElts;
8557 uint64_t SubElt = Index % NumSubElts;
8558 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8559 }
8560
8561 // Recurse into extract_subvector src vector to find scalars.
8562 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8563 SDValue Src = Op.getOperand(0);
8564 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8565 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8566 }
8567
8568 // We only peek through bitcasts of the same vector width.
8569 if (Opcode == ISD::BITCAST) {
8570 SDValue Src = Op.getOperand(0);
8571 EVT SrcVT = Src.getValueType();
8572 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8573 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8574 return SDValue();
8575 }
8576
8577 // Actual nodes that may contain scalar elements
8578
8579 // For insert_vector_elt - either return the index matching scalar or recurse
8580 // into the base vector.
8581 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8582 isa<ConstantSDNode>(Op.getOperand(2))) {
8583 if (Op.getConstantOperandAPInt(2) == Index)
8584 return Op.getOperand(1);
8585 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8586 }
8587
8588 if (Opcode == ISD::SCALAR_TO_VECTOR)
8589 return (Index == 0) ? Op.getOperand(0)
8590 : DAG.getUNDEF(VT.getVectorElementType());
8591
8592 if (Opcode == ISD::BUILD_VECTOR)
8593 return Op.getOperand(Index);
8594
8595 return SDValue();
8596}
8597
8598// Use PINSRB/PINSRW/PINSRD to create a build vector.
8599static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8600 unsigned NumNonZero, unsigned NumZero,
8601 SelectionDAG &DAG,
8602 const X86Subtarget &Subtarget) {
8603 MVT VT = Op.getSimpleValueType();
8604 unsigned NumElts = VT.getVectorNumElements();
8605 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8606 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8607 "Illegal vector insertion");
8608
8609 SDLoc dl(Op);
8610 SDValue V;
8611 bool First = true;
8612
8613 for (unsigned i = 0; i < NumElts; ++i) {
8614 bool IsNonZero = NonZeroMask[i];
8615 if (!IsNonZero)
8616 continue;
8617
8618 // If the build vector contains zeros or our first insertion is not the
8619 // first index then insert into zero vector to break any register
8620 // dependency else use SCALAR_TO_VECTOR.
8621 if (First) {
8622 First = false;
8623 if (NumZero || 0 != i)
8624 V = getZeroVector(VT, Subtarget, DAG, dl);
8625 else {
8626 assert(0 == i && "Expected insertion into zero-index");
8627 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8628 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8629 V = DAG.getBitcast(VT, V);
8630 continue;
8631 }
8632 }
8633 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8634 DAG.getIntPtrConstant(i, dl));
8635 }
8636
8637 return V;
8638}
8639
8640/// Custom lower build_vector of v16i8.
8641static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8642 unsigned NumNonZero, unsigned NumZero,
8643 SelectionDAG &DAG,
8644 const X86Subtarget &Subtarget) {
8645 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8646 return SDValue();
8647
8648 // SSE4.1 - use PINSRB to insert each byte directly.
8649 if (Subtarget.hasSSE41())
8650 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8651 Subtarget);
8652
8653 SDLoc dl(Op);
8654 SDValue V;
8655
8656 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8657 for (unsigned i = 0; i < 16; i += 2) {
8658 bool ThisIsNonZero = NonZeroMask[i];
8659 bool NextIsNonZero = NonZeroMask[i + 1];
8660 if (!ThisIsNonZero && !NextIsNonZero)
8661 continue;
8662
8663 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
8664 SDValue Elt;
8665 if (ThisIsNonZero) {
8666 if (NumZero || NextIsNonZero)
8667 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8668 else
8669 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8670 }
8671
8672 if (NextIsNonZero) {
8673 SDValue NextElt = Op.getOperand(i + 1);
8674 if (i == 0 && NumZero)
8675 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8676 else
8677 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8678 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8679 DAG.getConstant(8, dl, MVT::i8));
8680 if (ThisIsNonZero)
8681 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8682 else
8683 Elt = NextElt;
8684 }
8685
8686 // If our first insertion is not the first index or zeros are needed, then
8687 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8688 // elements undefined).
8689 if (!V) {
8690 if (i != 0 || NumZero)
8691 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8692 else {
8693 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8694 V = DAG.getBitcast(MVT::v8i16, V);
8695 continue;
8696 }
8697 }
8698 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8699 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8700 DAG.getIntPtrConstant(i / 2, dl));
8701 }
8702
8703 return DAG.getBitcast(MVT::v16i8, V);
8704}
8705
8706/// Custom lower build_vector of v8i16.
8707static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8708 unsigned NumNonZero, unsigned NumZero,
8709 SelectionDAG &DAG,
8710 const X86Subtarget &Subtarget) {
8711 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8712 return SDValue();
8713
8714 // Use PINSRW to insert each byte directly.
8715 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8716 Subtarget);
8717}
8718
8719/// Custom lower build_vector of v4i32 or v4f32.
8720static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8721 const X86Subtarget &Subtarget) {
8722 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8723 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8724 // Because we're creating a less complicated build vector here, we may enable
8725 // further folding of the MOVDDUP via shuffle transforms.
8726 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8727 Op.getOperand(0) == Op.getOperand(2) &&
8728 Op.getOperand(1) == Op.getOperand(3) &&
8729 Op.getOperand(0) != Op.getOperand(1)) {
8730 SDLoc DL(Op);
8731 MVT VT = Op.getSimpleValueType();
8732 MVT EltVT = VT.getVectorElementType();
8733 // Create a new build vector with the first 2 elements followed by undef
8734 // padding, bitcast to v2f64, duplicate, and bitcast back.
8735 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8736 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8737 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8738 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8739 return DAG.getBitcast(VT, Dup);
8740 }
8741
8742 // Find all zeroable elements.
8743 std::bitset<4> Zeroable, Undefs;
8744 for (int i = 0; i < 4; ++i) {
8745 SDValue Elt = Op.getOperand(i);
8746 Undefs[i] = Elt.isUndef();
8747 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8748 }
8749 assert(Zeroable.size() - Zeroable.count() > 1 &&
8750 "We expect at least two non-zero elements!");
8751
8752 // We only know how to deal with build_vector nodes where elements are either
8753 // zeroable or extract_vector_elt with constant index.
8754 SDValue FirstNonZero;
8755 unsigned FirstNonZeroIdx;
8756 for (unsigned i = 0; i < 4; ++i) {
8757 if (Zeroable[i])
8758 continue;
8759 SDValue Elt = Op.getOperand(i);
8760 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8761 !isa<ConstantSDNode>(Elt.getOperand(1)))
8762 return SDValue();
8763 // Make sure that this node is extracting from a 128-bit vector.
8764 MVT VT = Elt.getOperand(0).getSimpleValueType();
8765 if (!VT.is128BitVector())
8766 return SDValue();
8767 if (!FirstNonZero.getNode()) {
8768 FirstNonZero = Elt;
8769 FirstNonZeroIdx = i;
8770 }
8771 }
8772
8773 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8774 SDValue V1 = FirstNonZero.getOperand(0);
8775 MVT VT = V1.getSimpleValueType();
8776
8777 // See if this build_vector can be lowered as a blend with zero.
8778 SDValue Elt;
8779 unsigned EltMaskIdx, EltIdx;
8780 int Mask[4];
8781 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8782 if (Zeroable[EltIdx]) {
8783 // The zero vector will be on the right hand side.
8784 Mask[EltIdx] = EltIdx+4;
8785 continue;
8786 }
8787
8788 Elt = Op->getOperand(EltIdx);
8789 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
8790 EltMaskIdx = Elt.getConstantOperandVal(1);
8791 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8792 break;
8793 Mask[EltIdx] = EltIdx;
8794 }
8795
8796 if (EltIdx == 4) {
8797 // Let the shuffle legalizer deal with blend operations.
8798 SDValue VZeroOrUndef = (Zeroable == Undefs)
8799 ? DAG.getUNDEF(VT)
8800 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8801 if (V1.getSimpleValueType() != VT)
8802 V1 = DAG.getBitcast(VT, V1);
8803 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8804 }
8805
8806 // See if we can lower this build_vector to an INSERTPS.
8807 if (!Subtarget.hasSSE41())
8808 return SDValue();
8809
8810 SDValue V2 = Elt.getOperand(0);
8811 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8812 V1 = SDValue();
8813
8814 bool CanFold = true;
8815 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8816 if (Zeroable[i])
8817 continue;
8818
8819 SDValue Current = Op->getOperand(i);
8820 SDValue SrcVector = Current->getOperand(0);
8821 if (!V1.getNode())
8822 V1 = SrcVector;
8823 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8824 }
8825
8826 if (!CanFold)
8827 return SDValue();
8828
8829 assert(V1.getNode() && "Expected at least two non-zero elements!");
8830 if (V1.getSimpleValueType() != MVT::v4f32)
8831 V1 = DAG.getBitcast(MVT::v4f32, V1);
8832 if (V2.getSimpleValueType() != MVT::v4f32)
8833 V2 = DAG.getBitcast(MVT::v4f32, V2);
8834
8835 // Ok, we can emit an INSERTPS instruction.
8836 unsigned ZMask = Zeroable.to_ulong();
8837
8838 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8839 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8840 SDLoc DL(Op);
8841 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8842 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8843 return DAG.getBitcast(VT, Result);
8844}
8845
8846/// Return a vector logical shift node.
8847static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8848 SelectionDAG &DAG, const TargetLowering &TLI,
8849 const SDLoc &dl) {
8850 assert(VT.is128BitVector() && "Unknown type for VShift");
8851 MVT ShVT = MVT::v16i8;
8852 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8853 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8854 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8855 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8856 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8857}
8858
8859static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8860 SelectionDAG &DAG) {
8861
8862 // Check if the scalar load can be widened into a vector load. And if
8863 // the address is "base + cst" see if the cst can be "absorbed" into
8864 // the shuffle mask.
8865 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8866 SDValue Ptr = LD->getBasePtr();
8867 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8868 return SDValue();
8869 EVT PVT = LD->getValueType(0);
8870 if (PVT != MVT::i32 && PVT != MVT::f32)
8871 return SDValue();
8872
8873 int FI = -1;
8874 int64_t Offset = 0;
8875 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8876 FI = FINode->getIndex();
8877 Offset = 0;
8878 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8879 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8880 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8881 Offset = Ptr.getConstantOperandVal(1);
8882 Ptr = Ptr.getOperand(0);
8883 } else {
8884 return SDValue();
8885 }
8886
8887 // FIXME: 256-bit vector instructions don't require a strict alignment,
8888 // improve this code to support it better.
8889 Align RequiredAlign(VT.getSizeInBits() / 8);
8890 SDValue Chain = LD->getChain();
8891 // Make sure the stack object alignment is at least 16 or 32.
8892 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8893 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8894 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8895 if (MFI.isFixedObjectIndex(FI)) {
8896 // Can't change the alignment. FIXME: It's possible to compute
8897 // the exact stack offset and reference FI + adjust offset instead.
8898 // If someone *really* cares about this, that's the way to implement it.
8899 return SDValue();
8900 } else {
8901 MFI.setObjectAlignment(FI, RequiredAlign);
8902 }
8903 }
8904
8905 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8906 // Ptr + (Offset & ~15).
8907 if (Offset < 0)
8908 return SDValue();
8909 if ((Offset % RequiredAlign.value()) & 3)
8910 return SDValue();
8911 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8912 if (StartOffset) {
8913 SDLoc DL(Ptr);
8914 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8915 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8916 }
8917
8918 int EltNo = (Offset - StartOffset) >> 2;
8919 unsigned NumElems = VT.getVectorNumElements();
8920
8921 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8922 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8923 LD->getPointerInfo().getWithOffset(StartOffset));
8924
8925 SmallVector<int, 8> Mask(NumElems, EltNo);
8926
8927 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8928 }
8929
8930 return SDValue();
8931}
8932
8933// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8934static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8935 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8936 auto *BaseLd = cast<LoadSDNode>(Elt);
8937 if (!BaseLd->isSimple())
8938 return false;
8939 Ld = BaseLd;
8940 ByteOffset = 0;
8941 return true;
8942 }
8943
8944 switch (Elt.getOpcode()) {
8945 case ISD::BITCAST:
8946 case ISD::TRUNCATE:
8947 case ISD::SCALAR_TO_VECTOR:
8948 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8949 case ISD::SRL:
8950 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8951 uint64_t Amt = AmtC->getZExtValue();
8952 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8953 ByteOffset += Amt / 8;
8954 return true;
8955 }
8956 }
8957 break;
8958 case ISD::EXTRACT_VECTOR_ELT:
8959 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8960 SDValue Src = Elt.getOperand(0);
8961 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8962 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8963 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8964 findEltLoadSrc(Src, Ld, ByteOffset)) {
8965 uint64_t Idx = IdxC->getZExtValue();
8966 ByteOffset += Idx * (SrcSizeInBits / 8);
8967 return true;
8968 }
8969 }
8970 break;
8971 }
8972
8973 return false;
8974}
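
findEltLoadSrc peeks through wrapper nodes (bitcast, truncate, scalar_to_vector, constant SRL, extract_vector_elt) and accumulates the byte offset of the element within its source load. As a rough arithmetic sketch only (not part of the analyzed file, names hypothetical), the EXTRACT_VECTOR_ELT + SRL path accumulates as follows:

#include <cassert>

// Byte offset after peeking through extract_vector_elt(ExtractIdx) of a load
// of SrcEltBits-wide elements, followed by an srl by ShiftAmt bits.
unsigned accumulatedByteOffset(unsigned ExtractIdx, unsigned SrcEltBits,
                               unsigned ShiftAmt) {
  assert((SrcEltBits % 8) == 0 && (ShiftAmt % 8) == 0);
  return ExtractIdx * (SrcEltBits / 8) + ShiftAmt / 8;
}

// e.g. accumulatedByteOffset(2, 32, 16) == 10: element 2 of a loaded v4i32
// shifted right by 16 bits refers to byte 10 of the original load.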
8975
8976/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8977/// elements can be replaced by a single large load which has the same value as
8978/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8979///
8980/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8981static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8982 const SDLoc &DL, SelectionDAG &DAG,
8983 const X86Subtarget &Subtarget,
8984 bool IsAfterLegalize) {
8985 if ((VT.getScalarSizeInBits() % 8) != 0)
8986 return SDValue();
8987
8988 unsigned NumElems = Elts.size();
8989
8990 int LastLoadedElt = -1;
8991 APInt LoadMask = APInt::getZero(NumElems);
8992 APInt ZeroMask = APInt::getZero(NumElems);
8993 APInt UndefMask = APInt::getZero(NumElems);
8994
8995 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8996 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8997
8998 // For each element in the initializer, see if we've found a load, zero or an
8999 // undef.
9000 for (unsigned i = 0; i < NumElems; ++i) {
9001 SDValue Elt = peekThroughBitcasts(Elts[i]);
9002 if (!Elt.getNode())
9003 return SDValue();
9004 if (Elt.isUndef()) {
9005 UndefMask.setBit(i);
9006 continue;
9007 }
9008 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9009 ZeroMask.setBit(i);
9010 continue;
9011 }
9012
9013 // Each loaded element must be the correct fractional portion of the
9014 // requested vector load.
9015 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9016 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9017 return SDValue();
9018
9019 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9020 return SDValue();
9021 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9022 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9023 return SDValue();
9024
9025 LoadMask.setBit(i);
9026 LastLoadedElt = i;
9027 }
9028 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
9029 LoadMask.countPopulation()) == NumElems &&
9030 "Incomplete element masks");
9031
9032 // Handle Special Cases - all undef or undef/zero.
9033 if (UndefMask.countPopulation() == NumElems)
9034 return DAG.getUNDEF(VT);
9035 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
9036 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9037 : DAG.getConstantFP(0.0, DL, VT);
9038
9039 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9040 int FirstLoadedElt = LoadMask.countTrailingZeros();
9041 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9042 EVT EltBaseVT = EltBase.getValueType();
9043 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9044 "Register/Memory size mismatch");
9045 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9046 assert(LDBase && "Did not find base load for merging consecutive loads");
9047 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9048 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9049 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9050 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9051 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9052
9053 // TODO: Support offsetting the base load.
9054 if (ByteOffsets[FirstLoadedElt] != 0)
9055 return SDValue();
9056
9057 // Check to see if the element's load is consecutive to the base load
9058 // or offset from a previous (already checked) load.
9059 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9060 LoadSDNode *Ld = Loads[EltIdx];
9061 int64_t ByteOffset = ByteOffsets[EltIdx];
9062 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9063 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9064 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9065 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9066 }
9067 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9068 EltIdx - FirstLoadedElt);
9069 };
9070
9071 // Consecutive loads can contain UNDEFS but not ZERO elements.
9072 // Consecutive loads with both UNDEF and ZERO elements require an
9073 // additional shuffle stage to clear the ZERO elements.
9074 bool IsConsecutiveLoad = true;
9075 bool IsConsecutiveLoadWithZeros = true;
9076 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9077 if (LoadMask[i]) {
9078 if (!CheckConsecutiveLoad(LDBase, i)) {
9079 IsConsecutiveLoad = false;
9080 IsConsecutiveLoadWithZeros = false;
9081 break;
9082 }
9083 } else if (ZeroMask[i]) {
9084 IsConsecutiveLoad = false;
9085 }
9086 }
9087
9088 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9089 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9090 assert(LDBase->isSimple() &&
9091 "Cannot merge volatile or atomic loads.");
9092 SDValue NewLd =
9093 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9094 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9095 MMOFlags);
9096 for (auto *LD : Loads)
9097 if (LD)
9098 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9099 return NewLd;
9100 };
9101
9102 // Check if the base load is entirely dereferenceable.
9103 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9104 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9105
9106 // LOAD - all consecutive load/undefs (must start/end with a load or be
9107 // entirely dereferenceable). If we have found an entire vector of loads and
9108 // undefs, then return a large load of the entire vector width starting at the
9109 // base pointer. If the vector contains zeros, then attempt to shuffle those
9110 // elements.
9111 if (FirstLoadedElt == 0 &&
9112 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9113 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9114 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9115 return SDValue();
9116
9117 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9118 // will lower to regular temporal loads and use the cache.
9119 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
9120 VT.is256BitVector() && !Subtarget.hasInt256())
9121 return SDValue();
9122
9123 if (NumElems == 1)
9124 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9125
9126 if (!ZeroMask)
9127 return CreateLoad(VT, LDBase);
9128
9129 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9130 // vector and a zero vector to clear out the zero elements.
9131 if (!IsAfterLegalize && VT.isVector()) {
9132 unsigned NumMaskElts = VT.getVectorNumElements();
9133 if ((NumMaskElts % NumElems) == 0) {
9134 unsigned Scale = NumMaskElts / NumElems;
9135 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9136 for (unsigned i = 0; i < NumElems; ++i) {
9137 if (UndefMask[i])
9138 continue;
9139 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9140 for (unsigned j = 0; j != Scale; ++j)
9141 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9142 }
9143 SDValue V = CreateLoad(VT, LDBase);
9144 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9145 : DAG.getConstantFP(0.0, DL, VT);
9146 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9147 }
9148 }
9149 }
9150
9151 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9152 if (VT.is256BitVector() || VT.is512BitVector()) {
9153 unsigned HalfNumElems = NumElems / 2;
9154 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9155 EVT HalfVT =
9156 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9157 SDValue HalfLD =
9158 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9159 DAG, Subtarget, IsAfterLegalize);
9160 if (HalfLD)
9161 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9162 HalfLD, DAG.getIntPtrConstant(0, DL));
9163 }
9164 }
9165
9166 // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
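// Illustrative example (not in the original source): for a v4i32 build_vector
// <(load i32 p), 0, 0, 0> the code below emits an X86ISD::VZEXT_LOAD that reads
// 32 bits from p and zero-fills the remaining lanes of the vector.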
9167 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9168 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9169 LoadSizeInBits == 64) &&
9170 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9171 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9172 : MVT::getIntegerVT(LoadSizeInBits);
9173 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9174 // Allow v4f32 on SSE1 only targets.
9175 // FIXME: Add more isel patterns so we can just use VT directly.
9176 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9177 VecVT = MVT::v4f32;
9178 if (TLI.isTypeLegal(VecVT)) {
9179 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9180 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9181 SDValue ResNode = DAG.getMemIntrinsicNode(
9182 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9183 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9184 for (auto *LD : Loads)
9185 if (LD)
9186 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9187 return DAG.getBitcast(VT, ResNode);
9188 }
9189 }
9190
9191 // BROADCAST - match the smallest possible repetition pattern, load that
9192 // scalar/subvector element and then broadcast to the entire vector.
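// Illustrative example (not in the original source): for
// <(load p), (load p+4), (load p), (load p+4)> the smallest repetition is two
// elements (64 bits), so a single 64-bit scalar is loaded and broadcast across the
// vector; wider repetition patterns are instead loaded once and concatenated.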
9193 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9194 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9195 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9196 unsigned RepeatSize = SubElems * BaseSizeInBits;
9197 unsigned ScalarSize = std::min(RepeatSize, 64u);
9198 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9199 continue;
9200
9201 // Don't attempt a 1:N subvector broadcast - it should be caught by
9202 // combineConcatVectorOps, otherwise it will cause infinite loops.
9203 if (RepeatSize > ScalarSize && SubElems == 1)
9204 continue;
9205
9206 bool Match = true;
9207 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9208 for (unsigned i = 0; i != NumElems && Match; ++i) {
9209 if (!LoadMask[i])
9210 continue;
9211 SDValue Elt = peekThroughBitcasts(Elts[i]);
9212 if (RepeatedLoads[i % SubElems].isUndef())
9213 RepeatedLoads[i % SubElems] = Elt;
9214 else
9215 Match &= (RepeatedLoads[i % SubElems] == Elt);
9216 }
9217
9218 // We must have loads at both ends of the repetition.
9219 Match &= !RepeatedLoads.front().isUndef();
9220 Match &= !RepeatedLoads.back().isUndef();
9221 if (!Match)
9222 continue;
9223
9224 EVT RepeatVT =
9225 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9226 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9227 : EVT::getFloatingPointVT(ScalarSize);
9228 if (RepeatSize > ScalarSize)
9229 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9230 RepeatSize / ScalarSize);
9231 EVT BroadcastVT =
9232 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9233 VT.getSizeInBits() / ScalarSize);
9234 if (TLI.isTypeLegal(BroadcastVT)) {
9235 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9236 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9237 SDValue Broadcast = RepeatLoad;
9238 if (RepeatSize > ScalarSize) {
9239 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9240 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9241 } else {
9242 if (!Subtarget.hasAVX2() &&
9243 !X86::mayFoldLoadIntoBroadcastFromMem(
9244 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9245 Subtarget,
9246 /*AssumeSingleUse=*/true))
9247 return SDValue();
9248 Broadcast =
9249 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9250 }
9251 return DAG.getBitcast(VT, Broadcast);
9252 }
9253 }
9254 }
9255 }
9256
9257 return SDValue();
9258}
9259
9260// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
9261// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9262// are consecutive, non-overlapping, and in the right order.
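// Illustrative example (not in the original source): a shuffle whose lanes resolve
// (via getShuffleScalarElt) to scalar loads from p, p+4, p+8, p+12 in order is
// handed to EltsFromConsecutiveLoads and can be replaced by one wide load from p.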
9263static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9264 SelectionDAG &DAG,
9265 const X86Subtarget &Subtarget,
9266 bool IsAfterLegalize) {
9267 SmallVector<SDValue, 64> Elts;
9268 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9269 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9270 Elts.push_back(Elt);
9271 continue;
9272 }
9273 return SDValue();
9274 }
9275 assert(Elts.size() == VT.getVectorNumElements());
9276 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9277 IsAfterLegalize);
9278}
9279
9280static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9281 unsigned SplatBitSize, LLVMContext &C) {
9282 unsigned ScalarSize = VT.getScalarSizeInBits();
9283 unsigned NumElm = SplatBitSize / ScalarSize;
9284
9285 SmallVector<Constant *, 32> ConstantVec;
9286 for (unsigned i = 0; i < NumElm; i++) {
9287 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9288 Constant *Const;
9289 if (VT.isFloatingPoint()) {
9290 if (ScalarSize == 16) {
9291 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9292 } else if (ScalarSize == 32) {
9293 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9294 } else {
9295 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9296 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9297 }
9298 } else
9299 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9300 ConstantVec.push_back(Const);
9301 }
9302 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9303}
9304
9305static bool isFoldableUseOfShuffle(SDNode *N) {
9306 for (auto *U : N->uses()) {
9307 unsigned Opc = U->getOpcode();
9308 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9309 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9310 return false;
9311 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9312 return false;
9313 if (isTargetShuffle(Opc))
9314 return true;
9315 if (Opc == ISD::BITCAST) // Ignore bitcasts
9316 return isFoldableUseOfShuffle(U);
9317 if (N->hasOneUse()) {
9318 // TODO: there may be some general way to know if an SDNode can
9319 // be folded. We currently only know whether an MI is foldable.
9320 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9321 return false;
9322 return true;
9323 }
9324 }
9325 return false;
9326}
9327
9328/// Attempt to use the vbroadcast instruction to generate a splat value
9329/// from a splat BUILD_VECTOR which uses:
9330/// a. A single scalar load, or a constant.
9331/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9332///
9333/// The VBROADCAST node is returned when a pattern is found,
9334/// or SDValue() otherwise.
9335static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9336 const X86Subtarget &Subtarget,
9337 SelectionDAG &DAG) {
9338 // VBROADCAST requires AVX.
9339 // TODO: Splats could be generated for non-AVX CPUs using SSE
9340 // instructions, but there's less potential gain for only 128-bit vectors.
9341 if (!Subtarget.hasAVX())
9342 return SDValue();
9343
9344 MVT VT = BVOp->getSimpleValueType(0);
9345 unsigned NumElts = VT.getVectorNumElements();
9346 SDLoc dl(BVOp);
9347
9348 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9349        "Unsupported vector type for broadcast.");
9350
9351 // See if the build vector is a repeating sequence of scalars (inc. splat).
9352 SDValue Ld;
9353 BitVector UndefElements;
9354 SmallVector<SDValue, 16> Sequence;
9355 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9356 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9357 if (Sequence.size() == 1)
9358 Ld = Sequence[0];
9359 }
9360
9361 // Attempt to use VBROADCASTM
9362 // From this pattern:
9363 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9364 // b. t1 = (build_vector t0 t0)
9365 //
9366 // Create (VBROADCASTM v2i1 X)
9367 if (!Sequence.empty() && Subtarget.hasCDI()) {
9368 // If not a splat, are the upper sequence values zeroable?
9369 unsigned SeqLen = Sequence.size();
9370 bool UpperZeroOrUndef =
9371 SeqLen == 1 ||
9372 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
9373 return !V || V.isUndef() || isNullConstant(V);
9374 });
9375 SDValue Op0 = Sequence[0];
9376 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9377 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9378 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9379 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9380 ? Op0.getOperand(0)
9381 : Op0.getOperand(0).getOperand(0);
9382 MVT MaskVT = BOperand.getSimpleValueType();
9383 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9384 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9385 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9386 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9387 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9388 unsigned Scale = 512 / VT.getSizeInBits();
9389 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9390 }
9391 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9392 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9393 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9394 return DAG.getBitcast(VT, Bcst);
9395 }
9396 }
9397 }
9398
9399 unsigned NumUndefElts = UndefElements.count();
9400 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9401 APInt SplatValue, Undef;
9402 unsigned SplatBitSize;
9403 bool HasUndef;
9404 // Check if this is a repeated constant pattern suitable for broadcasting.
9405 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9406 SplatBitSize > VT.getScalarSizeInBits() &&
9407 SplatBitSize < VT.getSizeInBits()) {
9408 // Avoid replacing with broadcast when it's a use of a shuffle
9409 // instruction to preserve the present custom lowering of shuffles.
9410 if (isFoldableUseOfShuffle(BVOp))
9411 return SDValue();
9412 // replace BUILD_VECTOR with broadcast of the repeated constants.
9413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9414 LLVMContext *Ctx = DAG.getContext();
9415 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9416 if (Subtarget.hasAVX()) {
9417 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9418 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9419 // Splatted value can fit in one INTEGER constant in constant pool.
9420 // Load the constant and broadcast it.
9421 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9422 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9423 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9424 SDValue CP = DAG.getConstantPool(C, PVT);
9425 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9426
9427 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9428 SDVTList Tys =
9429 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9430 SDValue Ops[] = {DAG.getEntryNode(), CP};
9431 MachinePointerInfo MPI =
9432 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9433 SDValue Brdcst = DAG.getMemIntrinsicNode(
9434 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9435 MachineMemOperand::MOLoad);
9436 return DAG.getBitcast(VT, Brdcst);
9437 }
9438 if (SplatBitSize > 64) {
9439 // Load the vector of constants and broadcast it.
9440 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9441 *Ctx);
9442 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9443 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9444 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9445 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9446 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9447 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9448 MachinePointerInfo MPI =
9449 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9450 return DAG.getMemIntrinsicNode(
9451 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9452 MachineMemOperand::MOLoad);
9453 }
9454 }
9455 }
9456
9457 // If we are moving a scalar into a vector (Ld must be set and all elements
9458 // but 1 are undef) and that operation is not obviously supported by
9459 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9460 // That's better than general shuffling and may eliminate a load to GPR and
9461 // move from scalar to vector register.
9462 if (!Ld || NumElts - NumUndefElts != 1)
9463 return SDValue();
9464 unsigned ScalarSize = Ld.getValueSizeInBits();
9465 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9466 return SDValue();
9467 }
9468
9469 bool ConstSplatVal =
9470 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9471 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9472
9473 // TODO: Handle broadcasts of non-constant sequences.
9474
9475 // Make sure that all of the users of a non-constant load are from the
9476 // BUILD_VECTOR node.
9477 // FIXME: Is the use count needed for non-constant, non-load case?
9478 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9479 return SDValue();
9480
9481 unsigned ScalarSize = Ld.getValueSizeInBits();
9482 bool IsGE256 = (VT.getSizeInBits() >= 256);
9483
9484 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9485 // instruction to save 8 or more bytes of constant pool data.
9486 // TODO: If multiple splats are generated to load the same constant,
9487 // it may be detrimental to overall size. There needs to be a way to detect
9488 // that condition to know if this is truly a size win.
9489 bool OptForSize = DAG.shouldOptForSize();
9490
9491 // Handle broadcasting a single constant scalar from the constant pool
9492 // into a vector.
9493 // On Sandybridge (no AVX2), it is still better to load a constant vector
9494 // from the constant pool and not to broadcast it from a scalar.
9495 // But override that restriction when optimizing for size.
9496 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
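// Illustrative example (not in the original source): with AVX2 (or when optimizing
// for size), a v8f32 splat of the constant 1.0 becomes an X86ISD::VBROADCAST_LOAD
// of a single 4-byte constant-pool entry instead of a full 32-byte constant vector.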
9497 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9498 EVT CVT = Ld.getValueType();
9499 assert(!CVT.isVector() && "Must not broadcast a vector type");
9500
9501 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9502 // For size optimization, also splat v2f64 and v2i64, and for size opt
9503 // with AVX2, also splat i8 and i16.
9504 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9505 if (ScalarSize == 32 ||
9506 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9507 (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
9508 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9509 const Constant *C = nullptr;
9510 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9511 C = CI->getConstantIntValue();
9512 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9513 C = CF->getConstantFPValue();
9514
9515 assert(C && "Invalid constant type");
9516
9517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9518 SDValue CP =
9519 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9520 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9521
9522 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9523 SDValue Ops[] = {DAG.getEntryNode(), CP};
9524 MachinePointerInfo MPI =
9525 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9526 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9527 MPI, Alignment, MachineMemOperand::MOLoad);
9528 }
9529 }
9530
9531 // Handle AVX2 in-register broadcasts.
9532 if (!IsLoad && Subtarget.hasInt256() &&
9533 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9534 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9535
9536 // The scalar source must be a normal load.
9537 if (!IsLoad)
9538 return SDValue();
9539
9540 // Make sure the non-chain result is only used by this build vector.
9541 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9542 return SDValue();
9543
9544 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9545 (Subtarget.hasVLX() && ScalarSize == 64)) {
9546 auto *LN = cast<LoadSDNode>(Ld);
9547 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9548 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9549 SDValue BCast =
9550 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9551 LN->getMemoryVT(), LN->getMemOperand());
9552 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9553 return BCast;
9554 }
9555
9556 // The integer check is needed for broadcasting a 64-bit element into a 128-bit
9557 // vector, so it doesn't match double, since there is no vbroadcastsd xmm.
9558 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9559 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9560 auto *LN = cast<LoadSDNode>(Ld);
9561 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9562 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9563 SDValue BCast =
9564 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9565 LN->getMemoryVT(), LN->getMemOperand());
9566 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9567 return BCast;
9568 }
9569
9570 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9571 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9572
9573 // Unsupported broadcast.
9574 return SDValue();
9575}
9576
9577/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9578/// underlying vector and index.
9579///
9580/// Modifies \p ExtractedFromVec to the real vector and returns the real
9581/// index.
9582static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9583 SDValue ExtIdx) {
9584 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9585 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9586 return Idx;
9587
9588 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9589 // lowered this:
9590 // (extract_vector_elt (v8f32 %1), Constant<6>)
9591 // to:
9592 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9593 // (extract_subvector (v8f32 %0), Constant<4>),
9594 // undef)
9595 // Constant<0>)
9596 // In this case the vector is the extract_subvector expression and the index
9597 // is 2, as specified by the shuffle.
9598 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9599 SDValue ShuffleVec = SVOp->getOperand(0);
9600 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9601 assert(ShuffleVecVT.getVectorElementType() ==
9602        ExtractedFromVec.getSimpleValueType().getVectorElementType());
9603
9604 int ShuffleIdx = SVOp->getMaskElt(Idx);
9605 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9606 ExtractedFromVec = ShuffleVec;
9607 return ShuffleIdx;
9608 }
9609 return Idx;
9610}
9611
9612static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9613 MVT VT = Op.getSimpleValueType();
9614
9615 // Skip if insert_vec_elt is not supported.
9616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9617 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9618 return SDValue();
9619
9620 SDLoc DL(Op);
9621 unsigned NumElems = Op.getNumOperands();
9622
9623 SDValue VecIn1;
9624 SDValue VecIn2;
9625 SmallVector<unsigned, 4> InsertIndices;
9626 SmallVector<int, 8> Mask(NumElems, -1);
9627
9628 for (unsigned i = 0; i != NumElems; ++i) {
9629 unsigned Opc = Op.getOperand(i).getOpcode();
9630
9631 if (Opc == ISD::UNDEF)
9632 continue;
9633
9634 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9635 // Quit if more than 1 element needs inserting.
9636 if (InsertIndices.size() > 1)
9637 return SDValue();
9638
9639 InsertIndices.push_back(i);
9640 continue;
9641 }
9642
9643 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9644 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9645
9646 // Quit if non-constant index.
9647 if (!isa<ConstantSDNode>(ExtIdx))
9648 return SDValue();
9649 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9650
9651 // Quit if extracted from vector of different type.
9652 if (ExtractedFromVec.getValueType() != VT)
9653 return SDValue();
9654
9655 if (!VecIn1.getNode())
9656 VecIn1 = ExtractedFromVec;
9657 else if (VecIn1 != ExtractedFromVec) {
9658 if (!VecIn2.getNode())
9659 VecIn2 = ExtractedFromVec;
9660 else if (VecIn2 != ExtractedFromVec)
9661 // Quit if more than 2 vectors to shuffle
9662 return SDValue();
9663 }
9664
9665 if (ExtractedFromVec == VecIn1)
9666 Mask[i] = Idx;
9667 else if (ExtractedFromVec == VecIn2)
9668 Mask[i] = Idx + NumElems;
9669 }
9670
9671 if (!VecIn1.getNode())
9672 return SDValue();
9673
9674 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9675 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9676
9677 for (unsigned Idx : InsertIndices)
9678 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9679 DAG.getIntPtrConstant(Idx, DL));
9680
9681 return NV;
9682}
9683
9684// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9685static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9686 const X86Subtarget &Subtarget) {
9687
9688 MVT VT = Op.getSimpleValueType();
9689 assert((VT.getVectorElementType() == MVT::i1) &&
9690        "Unexpected type in LowerBUILD_VECTORvXi1!");
9691
9692 SDLoc dl(Op);
9693 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9694 ISD::isBuildVectorAllOnes(Op.getNode()))
9695 return Op;
9696
9697 uint64_t Immediate = 0;
9698 SmallVector<unsigned, 16> NonConstIdx;
9699 bool IsSplat = true;
9700 bool HasConstElts = false;
9701 int SplatIdx = -1;
9702 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9703 SDValue In = Op.getOperand(idx);
9704 if (In.isUndef())
9705 continue;
9706 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9707 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9708 HasConstElts = true;
9709 } else {
9710 NonConstIdx.push_back(idx);
9711 }
9712 if (SplatIdx < 0)
9713 SplatIdx = idx;
9714 else if (In != Op.getOperand(SplatIdx))
9715 IsSplat = false;
9716 }
9717
9718 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
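// Illustrative example (not in the original source): a v16i1 splat of %c becomes
// roughly (bitcast v16i1 (select %c, i16 -1, i16 0)); the select is performed in
// the scalar domain (so it can use cmov) and the result is bitcast back to a mask.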
9719 if (IsSplat) {
9720 // The build_vector allows the scalar element to be larger than the vector
9721 // element type. We need to mask it to use as a condition unless we know
9722 // the upper bits are zero.
9723 // FIXME: Use computeKnownBits instead of checking specific opcode?
9724 SDValue Cond = Op.getOperand(SplatIdx);
9725 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9726 if (Cond.getOpcode() != ISD::SETCC)
9727 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9728 DAG.getConstant(1, dl, MVT::i8));
9729
9730 // Perform the select in the scalar domain so we can use cmov.
9731 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9732 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9733 DAG.getAllOnesConstant(dl, MVT::i32),
9734 DAG.getConstant(0, dl, MVT::i32));
9735 Select = DAG.getBitcast(MVT::v32i1, Select);
9736 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9737 } else {
9738 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9739 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9740 DAG.getAllOnesConstant(dl, ImmVT),
9741 DAG.getConstant(0, dl, ImmVT));
9742 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9743 Select = DAG.getBitcast(VecVT, Select);
9744 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9745 DAG.getIntPtrConstant(0, dl));
9746 }
9747 }
9748
9749 // insert elements one by one
9750 SDValue DstVec;
9751 if (HasConstElts) {
9752 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9753 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9754 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9755 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9756 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9757 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9758 } else {
9759 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9760 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9761 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9762 DstVec = DAG.getBitcast(VecVT, Imm);
9763 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9764 DAG.getIntPtrConstant(0, dl));
9765 }
9766 } else
9767 DstVec = DAG.getUNDEF(VT);
9768
9769 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9770 unsigned InsertIdx = NonConstIdx[i];
9771 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9772 Op.getOperand(InsertIdx),
9773 DAG.getIntPtrConstant(InsertIdx, dl));
9774 }
9775 return DstVec;
9776}
9777
9778 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9779 switch (Opcode) {
9780 case X86ISD::PACKSS:
9781 case X86ISD::PACKUS:
9782 case X86ISD::FHADD:
9783 case X86ISD::FHSUB:
9784 case X86ISD::HADD:
9785 case X86ISD::HSUB:
9786 return true;
9787 }
9788 return false;
9789}
9790
9791/// This is a helper function of LowerToHorizontalOp().
9792/// This function checks that the build_vector \p N in input implements a
9793/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9794/// may not match the layout of an x86 256-bit horizontal instruction.
9795/// In other words, if this returns true, then some extraction/insertion will
9796/// be required to produce a valid horizontal instruction.
9797///
9798/// Parameter \p Opcode defines the kind of horizontal operation to match.
9799/// For example, if \p Opcode is equal to ISD::ADD, then this function
9800/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9801/// is equal to ISD::SUB, then this function checks if this is a horizontal
9802/// arithmetic sub.
9803///
9804/// This function only analyzes elements of \p N whose indices are
9805/// in range [BaseIdx, LastIdx).
9806///
9807/// TODO: This function was originally used to match both real and fake partial
9808/// horizontal operations, but the index-matching logic is incorrect for that.
9809/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9810/// code because it is only used for partial h-op matching now?
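// Illustrative example (not in the original source): with Opcode == ISD::ADD and
// the range [0, 4), the operands
//   (add (extractelt A,0), (extractelt A,1)), (add (extractelt A,2), (extractelt A,3)),
//   (add (extractelt B,0), (extractelt B,1)), (add (extractelt B,2), (extractelt B,3))
// match, returning V0 = A and V1 = B (both of the same 256-bit type as the result).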
9811static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9812 SelectionDAG &DAG,
9813 unsigned BaseIdx, unsigned LastIdx,
9814 SDValue &V0, SDValue &V1) {
9815 EVT VT = N->getValueType(0);
9816 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9817 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9818 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9819        "Invalid Vector in input!");
9820
9821 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9822 bool CanFold = true;
9823 unsigned ExpectedVExtractIdx = BaseIdx;
9824 unsigned NumElts = LastIdx - BaseIdx;
9825 V0 = DAG.getUNDEF(VT);
9826 V1 = DAG.getUNDEF(VT);
9827
9828 // Check if N implements a horizontal binop.
9829 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9830 SDValue Op = N->getOperand(i + BaseIdx);
9831
9832 // Skip UNDEFs.
9833 if (Op->isUndef()) {
9834 // Update the expected vector extract index.
9835 if (i * 2 == NumElts)
9836 ExpectedVExtractIdx = BaseIdx;
9837 ExpectedVExtractIdx += 2;
9838 continue;
9839 }
9840
9841 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9842
9843 if (!CanFold)
9844 break;
9845
9846 SDValue Op0 = Op.getOperand(0);
9847 SDValue Op1 = Op.getOperand(1);
9848
9849 // Try to match the following pattern:
9850 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9851 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9852 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9853 Op0.getOperand(0) == Op1.getOperand(0) &&
9854 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9855 isa<ConstantSDNode>(Op1.getOperand(1)));
9856 if (!CanFold)
9857 break;
9858
9859 unsigned I0 = Op0.getConstantOperandVal(1);
9860 unsigned I1 = Op1.getConstantOperandVal(1);
9861
9862 if (i * 2 < NumElts) {
9863 if (V0.isUndef()) {
9864 V0 = Op0.getOperand(0);
9865 if (V0.getValueType() != VT)
9866 return false;
9867 }
9868 } else {
9869 if (V1.isUndef()) {
9870 V1 = Op0.getOperand(0);
9871 if (V1.getValueType() != VT)
9872 return false;
9873 }
9874 if (i * 2 == NumElts)
9875 ExpectedVExtractIdx = BaseIdx;
9876 }
9877
9878 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9879 if (I0 == ExpectedVExtractIdx)
9880 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9881 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9882 // Try to match the following dag sequence:
9883 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9884 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9885 } else
9886 CanFold = false;
9887
9888 ExpectedVExtractIdx += 2;
9889 }
9890
9891 return CanFold;
9892}
9893
9894/// Emit a sequence of two 128-bit horizontal add/sub followed by
9895/// a concat_vector.
9896///
9897/// This is a helper function of LowerToHorizontalOp().
9898/// This function expects two 256-bit vectors called V0 and V1.
9899/// At first, each vector is split into two separate 128-bit vectors.
9900/// Then, the resulting 128-bit vectors are used to implement two
9901/// horizontal binary operations.
9902///
9903/// The kind of horizontal binary operation is defined by \p X86Opcode.
9904///
9905/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
9906 /// the two new horizontal binops.
9907/// When Mode is set, the first horizontal binop dag node would take as input
9908/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9909/// horizontal binop dag node would take as input the lower 128-bit of V1
9910/// and the upper 128-bit of V1.
9911/// Example:
9912/// HADD V0_LO, V0_HI
9913/// HADD V1_LO, V1_HI
9914///
9915/// Otherwise, the first horizontal binop dag node takes as input the lower
9916/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9917/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9918/// Example:
9919/// HADD V0_LO, V1_LO
9920/// HADD V0_HI, V1_HI
9921///
9922/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9923/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9924/// the upper 128-bits of the result.
9925static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9926 const SDLoc &DL, SelectionDAG &DAG,
9927 unsigned X86Opcode, bool Mode,
9928 bool isUndefLO, bool isUndefHI) {
9929 MVT VT = V0.getSimpleValueType();
9930 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9931        "Invalid nodes in input!");
9932
9933 unsigned NumElts = VT.getVectorNumElements();
9934 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9935 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9936 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9937 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9938 MVT NewVT = V0_LO.getSimpleValueType();
9939
9940 SDValue LO = DAG.getUNDEF(NewVT);
9941 SDValue HI = DAG.getUNDEF(NewVT);
9942
9943 if (Mode) {
9944 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9945 if (!isUndefLO && !V0->isUndef())
9946 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9947 if (!isUndefHI && !V1->isUndef())
9948 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9949 } else {
9950 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9951 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9952 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9953
9954 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9955 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9956 }
9957
9958 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9959}
9960
9961/// Returns true iff \p BV builds a vector with the result equivalent to
9962/// the result of ADDSUB/SUBADD operation.
9963/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9964/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9965/// \p Opnd0 and \p Opnd1.
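// Illustrative example (not in the original source): the build_vector
//   (fsub (extractelt A,0), (extractelt B,0)), (fadd (extractelt A,1), (extractelt B,1))
// is recognized as ADDSUB(A, B) (even lanes subtract, odd lanes add); the opposite
// parity is recognized as SUBADD.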
9966static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9967 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9968 SDValue &Opnd0, SDValue &Opnd1,
9969 unsigned &NumExtracts,
9970 bool &IsSubAdd) {
9971
9972 MVT VT = BV->getSimpleValueType(0);
9973 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9974 return false;
9975
9976 unsigned NumElts = VT.getVectorNumElements();
9977 SDValue InVec0 = DAG.getUNDEF(VT);
9978 SDValue InVec1 = DAG.getUNDEF(VT);
9979
9980 NumExtracts = 0;
9981
9982 // Odd-numbered elements in the input build vector are obtained from
9983 // adding/subtracting two integer/float elements.
9984 // Even-numbered elements in the input build vector are obtained from
9985 // subtracting/adding two integer/float elements.
9986 unsigned Opc[2] = {0, 0};
9987 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9988 SDValue Op = BV->getOperand(i);
9989
9990 // Skip 'undef' values.
9991 unsigned Opcode = Op.getOpcode();
9992 if (Opcode == ISD::UNDEF)
9993 continue;
9994
9995 // Early exit if we found an unexpected opcode.
9996 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9997 return false;
9998
9999 SDValue Op0 = Op.getOperand(0);
10000 SDValue Op1 = Op.getOperand(1);
10001
10002 // Try to match the following pattern:
10003 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10004 // Early exit if we cannot match that sequence.
10005 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10006 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10007 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10008 Op0.getOperand(1) != Op1.getOperand(1))
10009 return false;
10010
10011 unsigned I0 = Op0.getConstantOperandVal(1);
10012 if (I0 != i)
10013 return false;
10014
10015 // We found a valid add/sub node; make sure it's the same opcode as previous
10016 // elements for this parity.
10017 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10018 return false;
10019 Opc[i % 2] = Opcode;
10020
10021 // Update InVec0 and InVec1.
10022 if (InVec0.isUndef()) {
10023 InVec0 = Op0.getOperand(0);
10024 if (InVec0.getSimpleValueType() != VT)
10025 return false;
10026 }
10027 if (InVec1.isUndef()) {
10028 InVec1 = Op1.getOperand(0);
10029 if (InVec1.getSimpleValueType() != VT)
10030 return false;
10031 }
10032
10033 // Make sure that the operands of each add/sub node always
10034 // come from the same pair of vectors.
10035 if (InVec0 != Op0.getOperand(0)) {
10036 if (Opcode == ISD::FSUB)
10037 return false;
10038
10039 // FADD is commutable. Try to commute the operands
10040 // and then test again.
10041 std::swap(Op0, Op1);
10042 if (InVec0 != Op0.getOperand(0))
10043 return false;
10044 }
10045
10046 if (InVec1 != Op1.getOperand(0))
10047 return false;
10048
10049 // Increment the number of extractions done.
10050 ++NumExtracts;
10051 }
10052
10053 // Ensure we have found an opcode for both parities and that they are
10054 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10055 // inputs are undef.
10056 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10057 InVec0.isUndef() || InVec1.isUndef())
10058 return false;
10059
10060 IsSubAdd = Opc[0] == ISD::FADD;
10061
10062 Opnd0 = InVec0;
10063 Opnd1 = InVec1;
10064 return true;
10065}
10066
10067 /// Returns true if it is possible to fold MUL and an idiom that has already been
10068/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10069/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10070/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10071///
10072/// Prior to calling this function it should be known that there is some
10073/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10074/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10075/// before replacement of such SDNode with ADDSUB operation. Thus the number
10076/// of \p Opnd0 uses is expected to be equal to 2.
10077/// For example, this function may be called for the following IR:
10078/// %AB = fmul fast <2 x double> %A, %B
10079/// %Sub = fsub fast <2 x double> %AB, %C
10080/// %Add = fadd fast <2 x double> %AB, %C
10081/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10082/// <2 x i32> <i32 0, i32 3>
10083/// There is a def for %Addsub here, which potentially can be replaced by
10084/// X86ISD::ADDSUB operation:
10085/// %Addsub = X86ISD::ADDSUB %AB, %C
10086/// and such ADDSUB can further be replaced with FMADDSUB:
10087/// %Addsub = FMADDSUB %A, %B, %C.
10088///
10089/// The main reason why this method is called before the replacement of the
10090/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10091/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10092/// FMADDSUB is.
10093static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10094 SelectionDAG &DAG,
10095 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10096 unsigned ExpectedUses) {
10097 if (Opnd0.getOpcode() != ISD::FMUL ||
10098 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10099 return false;
10100
10101 // FIXME: These checks must match the similar ones in
10102 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10103 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10104 // or MUL + ADDSUB to FMADDSUB.
10105 const TargetOptions &Options = DAG.getTarget().Options;
10106 bool AllowFusion =
10107 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10108 if (!AllowFusion)
10109 return false;
10110
10111 Opnd2 = Opnd1;
10112 Opnd1 = Opnd0.getOperand(1);
10113 Opnd0 = Opnd0.getOperand(0);
10114
10115 return true;
10116}
10117
10118/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10119/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10120/// X86ISD::FMSUBADD node.
10121static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10122 const X86Subtarget &Subtarget,
10123 SelectionDAG &DAG) {
10124 SDValue Opnd0, Opnd1;
10125 unsigned NumExtracts;
10126 bool IsSubAdd;
10127 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10128 IsSubAdd))
10129 return SDValue();
10130
10131 MVT VT = BV->getSimpleValueType(0);
10132 SDLoc DL(BV);
10133
10134 // Try to generate X86ISD::FMADDSUB node here.
10135 SDValue Opnd2;
10136 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10137 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10138 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10139 }
10140
10141 // We only support ADDSUB.
10142 if (IsSubAdd)
10143 return SDValue();
10144
10145 // There are no known X86 targets with 512-bit ADDSUB instructions!
10146 // Convert to blend(fsub,fadd).
10147 if (VT.is512BitVector()) {
10148 SmallVector<int> Mask;
10149 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10150 Mask.push_back(I);
10151 Mask.push_back(I + E + 1);
10152 }
10153 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10154 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10155 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10156 }
10157
10158 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10159}
10160
10161static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10162 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10163 // Initialize outputs to known values.
10164 MVT VT = BV->getSimpleValueType(0);
10165 HOpcode = ISD::DELETED_NODE;
10166 V0 = DAG.getUNDEF(VT);
10167 V1 = DAG.getUNDEF(VT);
10168
10169 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10170 // half of the result is calculated independently from the 128-bit halves of
10171 // the inputs, so that makes the index-checking logic below more complicated.
10172 unsigned NumElts = VT.getVectorNumElements();
10173 unsigned GenericOpcode = ISD::DELETED_NODE;
10174 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
1) '?' condition is false
10175 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10176 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
2) 'NumEltsIn64Bits' initialized here
10177 for (unsigned i = 0; i != Num128BitChunks; ++i) {
3) Loop condition is true. Entering loop body
10178 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
4) Assuming 'j' is not equal to 'NumEltsIn128Bits'
5) Loop condition is true. Entering loop body
10179 // Ignore undef elements.
10180 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10181 if (Op.isUndef())
10182 continue;
10183
10184 // If there's an opcode mismatch, we're done.
10185 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
5.1) 'HOpcode' is equal to DELETED_NODE
10186 return false;
10187
10188 // Initialize horizontal opcode.
10189 if (HOpcode == ISD::DELETED_NODE) {
5.2) 'HOpcode' is equal to DELETED_NODE
6) Taking true branch
10190 GenericOpcode = Op.getOpcode();
10191 switch (GenericOpcode) {
7) Control jumps to 'case FADD:' at line 10194
10192 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10193 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10194 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8) Execution continues on line 10200
10195 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10196 default: return false;
10197 }
10198 }
10199
10200 SDValue Op0 = Op.getOperand(0);
10201 SDValue Op1 = Op.getOperand(1);
10202 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9) Assuming the condition is false
14) Taking false branch
10203 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10) Assuming the condition is false
10204 Op0.getOperand(0) != Op1.getOperand(0) ||
10205 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
11) Assuming the object is a 'ConstantSDNode'
10206 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
12) Assuming the object is a 'ConstantSDNode'
13) Assuming the condition is false
10207 return false;
10208
10209 // The source vector is chosen based on which 64-bit half of the
10210 // destination vector is being calculated.
10211 if (j < NumEltsIn64Bits) {
15) Assuming 'j' is >= 'NumEltsIn64Bits'
16) Taking false branch
10212 if (V0.isUndef())
10213 V0 = Op0.getOperand(0);
10214 } else {
10215 if (V1.isUndef())
10216 V1 = Op0.getOperand(0);
10217 }
10218
10219 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
16.1) 'j' is >= 'NumEltsIn64Bits'
17) '?' condition is false
10220 if (SourceVec != Op0.getOperand(0))
18) Taking false branch
10221 return false;
10222
10223 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10224 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10225 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10226 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10227 (j % NumEltsIn64Bits) * 2;
19) Division by zero
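// Note on the report above (not in the original source): 'NumEltsIn64Bits' is
// NumEltsIn128Bits / 2, so it is zero whenever a 128-bit chunk holds fewer than two
// elements, and the modulo on line 10227 then divides by zero on the path the
// analyzer assumes. The only caller shown in this listing (LowerToHorizontalOp)
// restricts VT to v4f32/v2f64/v8i16/v4i32/v8f32/v4f64/v16i16/v8i32, all of which
// have at least two elements per 128-bit chunk, so the report may be a false
// positive. A purely illustrative defensive guard (not the upstream fix) placed
// before the loops would silence it:
//   if (NumEltsIn64Bits == 0)
//     return false;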
10228 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10229 continue;
10230
10231 // If this is not a commutative op, this does not match.
10232 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10233 return false;
10234
10235 // Addition is commutative, so try swapping the extract indexes.
10236 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10237 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10238 continue;
10239
10240 // Extract indexes do not match horizontal requirement.
10241 return false;
10242 }
10243 }
10244 // We matched. Opcode and operands are returned by reference as arguments.
10245 return true;
10246}
10247
10248static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10249 SelectionDAG &DAG, unsigned HOpcode,
10250 SDValue V0, SDValue V1) {
10251 // If either input vector is not the same size as the build vector,
10252 // extract/insert the low bits to the correct size.
10253 // This is free (examples: zmm --> xmm, xmm --> ymm).
10254 MVT VT = BV->getSimpleValueType(0);
10255 unsigned Width = VT.getSizeInBits();
10256 if (V0.getValueSizeInBits() > Width)
10257 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10258 else if (V0.getValueSizeInBits() < Width)
10259 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10260
10261 if (V1.getValueSizeInBits() > Width)
10262 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10263 else if (V1.getValueSizeInBits() < Width)
10264 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10265
10266 unsigned NumElts = VT.getVectorNumElements();
10267 APInt DemandedElts = APInt::getAllOnes(NumElts);
10268 for (unsigned i = 0; i != NumElts; ++i)
10269 if (BV->getOperand(i).isUndef())
10270 DemandedElts.clearBit(i);
10271
10272 // If we don't need the upper xmm, then perform as a xmm hop.
10273 unsigned HalfNumElts = NumElts / 2;
10274 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10275 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10276 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10277 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10278 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10279 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10280 }
10281
10282 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10283}
10284
10285/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10286static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10287 const X86Subtarget &Subtarget,
10288 SelectionDAG &DAG) {
10289 // We need at least 2 non-undef elements to make this worthwhile by default.
10290 unsigned NumNonUndefs =
10291 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10292 if (NumNonUndefs < 2)
10293 return SDValue();
10294
10295 // There are 4 sets of horizontal math operations distinguished by type:
10296 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10297 // subtarget feature. Try to match those "native" patterns first.
10298 MVT VT = BV->getSimpleValueType(0);
10299 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10300 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10301 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10302 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10303 unsigned HOpcode;
10304 SDValue V0, V1;
10305 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10306 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10307 }
10308
10309 // Try harder to match 256-bit ops by using extract/concat.
10310 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10311 return SDValue();
10312
10313 // Count the number of UNDEF operands in the build_vector in input.
10314 unsigned NumElts = VT.getVectorNumElements();
10315 unsigned Half = NumElts / 2;
10316 unsigned NumUndefsLO = 0;
10317 unsigned NumUndefsHI = 0;
10318 for (unsigned i = 0, e = Half; i != e; ++i)
10319 if (BV->getOperand(i)->isUndef())
10320 NumUndefsLO++;
10321
10322 for (unsigned i = Half, e = NumElts; i != e; ++i)
10323 if (BV->getOperand(i)->isUndef())
10324 NumUndefsHI++;
10325
10326 SDLoc DL(BV);
10327 SDValue InVec0, InVec1;
10328 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10329 SDValue InVec2, InVec3;
10330 unsigned X86Opcode;
10331 bool CanFold = true;
10332
10333 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10334 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10335 InVec3) &&
10336 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10337 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10338 X86Opcode = X86ISD::HADD;
10339 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10340 InVec1) &&
10341 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10342 InVec3) &&
10343 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10344 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10345 X86Opcode = X86ISD::HSUB;
10346 else
10347 CanFold = false;
10348
10349 if (CanFold) {
10350 // Do not try to expand this build_vector into a pair of horizontal
10351 // add/sub if we can emit a pair of scalar add/sub.
10352 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10353 return SDValue();
10354
10355 // Convert this build_vector into a pair of horizontal binops followed by
10356 // a concat vector. We must adjust the outputs from the partial horizontal
10357 // matching calls above to account for undefined vector halves.
10358 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10359 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10360 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10361 bool isUndefLO = NumUndefsLO == Half;
10362 bool isUndefHI = NumUndefsHI == Half;
10363 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10364 isUndefHI);
10365 }
10366 }
10367
10368 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10369 VT == MVT::v16i16) {
10370 unsigned X86Opcode;
10371 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10372 X86Opcode = X86ISD::HADD;
10373 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10374 InVec1))
10375 X86Opcode = X86ISD::HSUB;
10376 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10377 InVec1))
10378 X86Opcode = X86ISD::FHADD;
10379 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10380 InVec1))
10381 X86Opcode = X86ISD::FHSUB;
10382 else
10383 return SDValue();
10384
10385 // Don't try to expand this build_vector into a pair of horizontal add/sub
10386 // if we can simply emit a pair of scalar add/sub.
10387 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10388 return SDValue();
10389
10390 // Convert this build_vector into two horizontal add/sub followed by
10391 // a concat vector.
10392 bool isUndefLO = NumUndefsLO == Half;
10393 bool isUndefHI = NumUndefsHI == Half;
10394 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10395 isUndefLO, isUndefHI);
10396 }
10397
10398 return SDValue();
10399}
10400
10401static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10402 SelectionDAG &DAG);
10403
10404/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10405/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10406/// just apply the bit to the vectors.
10407 /// NOTE: It's not in our interest to start making a general purpose vectorizer
10408 /// from this, but enough scalar bit operations are created from the later
10409 /// legalization + scalarization stages to need basic support.
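// Illustrative example (not in the original source): the build_vector
//   (and x0, 15), (and x1, 15), (and x2, 15), (and x3, 15)
// is lowered to (and (build_vector x0, x1, x2, x3), (build_vector 15, 15, 15, 15)),
// provided the build_vector is not itself a splat.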
10410static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10411 const X86Subtarget &Subtarget,
10412 SelectionDAG &DAG) {
10413 SDLoc DL(Op);
10414 MVT VT = Op->getSimpleValueType(0);
10415 unsigned NumElems = VT.getVectorNumElements();
10416 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10417
10418 // Check that all elements have the same opcode.
10419 // TODO: Should we allow UNDEFS and if so how many?
10420 unsigned Opcode = Op->getOperand(0).getOpcode();
10421 for (unsigned i = 1; i < NumElems; ++i)
10422 if (Opcode != Op->getOperand(i).getOpcode())
10423 return SDValue();
10424
10425 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10426 bool IsShift = false;
10427 switch (Opcode) {
10428 default:
10429 return SDValue();
10430 case ISD::SHL:
10431 case ISD::SRL:
10432 case ISD::SRA:
10433 IsShift = true;
10434 break;
10435 case ISD::AND:
10436 case ISD::XOR:
10437 case ISD::OR:
10438 // Don't do this if the buildvector is a splat - we'd replace one
10439 // constant with an entire vector.
10440 if (Op->getSplatValue())
10441 return SDValue();
10442 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10443 return SDValue();
10444 break;
10445 }
10446
10447 SmallVector<SDValue, 4> LHSElts, RHSElts;
10448 for (SDValue Elt : Op->ops()) {
10449 SDValue LHS = Elt.getOperand(0);
10450 SDValue RHS = Elt.getOperand(1);
10451
10452 // We expect the canonicalized RHS operand to be the constant.
10453 if (!isa<ConstantSDNode>(RHS))
10454 return SDValue();
10455
10456 // Extend shift amounts.
10457 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10458 if (!IsShift)
10459 return SDValue();
10460 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10461 }
10462
10463 LHSElts.push_back(LHS);
10464 RHSElts.push_back(RHS);
10465 }
10466
10467 // Limit to shifts by uniform immediates.
10468 // TODO: Only accept vXi8/vXi64 special cases?
10469 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10470 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10471 return SDValue();
10472
10473 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10474 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10475 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10476
10477 if (!IsShift)
10478 return Res;
10479
10480 // Immediately lower the shift to ensure the constant build vector doesn't
10481 // get converted to a constant pool before the shift is lowered.
10482 return LowerShift(Res, Subtarget, DAG);
10483}
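// A minimal sketch of the transform above: on v4i32,
//   (build_vector (shl a, 3), (shl b, 3), (shl c, 3), (shl d, 3))
// is rebuilt as
//   LowerShift((shl (build_vector a, b, c, d), (build_vector 3, 3, 3, 3)))
// so the uniform-immediate shift can be selected as a single vector shift
// (e.g. PSLLD) rather than four scalar shifts feeding an element build.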
10484
10485/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10486/// functionality to do this, so it's all zeros, all ones, or some derivation
10487/// that is cheap to calculate.
10488static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10489 const X86Subtarget &Subtarget) {
10490 SDLoc DL(Op);
10491 MVT VT = Op.getSimpleValueType();
10492
10493 // Vectors containing all zeros can be matched by pxor and xorps.
10494 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10495 return Op;
10496
10497 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10498 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10499 // vpcmpeqd on 256-bit vectors.
10500 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10501 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10502 return Op;
10503
10504 return getOnesVector(VT, DAG, DL);
10505 }
10506
10507 return SDValue();
10508}
10509
10510/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10511/// from a vector of source values and a vector of extraction indices.
10512/// The vectors might be manipulated to match the type of the permute op.
10513static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10514 SDLoc &DL, SelectionDAG &DAG,
10515 const X86Subtarget &Subtarget) {
10516 MVT ShuffleVT = VT;
10517 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10518 unsigned NumElts = VT.getVectorNumElements();
10519 unsigned SizeInBits = VT.getSizeInBits();
10520
10521 // Adjust IndicesVec to match VT size.
10522   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10523          "Illegal variable permute mask size");
10524 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10525 // Narrow/widen the indices vector to the correct size.
10526 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10527 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10528 NumElts * VT.getScalarSizeInBits());
10529 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10530 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10531 SDLoc(IndicesVec), SizeInBits);
10532 // Zero-extend the index elements within the vector.
10533 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10534 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10535 IndicesVT, IndicesVec);
10536 }
10537 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10538
10539   // Handle a SrcVec whose size doesn't match VT.
10540 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10541 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10542 // Handle larger SrcVec by treating it as a larger permute.
10543 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10544 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10545 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10546 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10547 Subtarget, DAG, SDLoc(IndicesVec));
10548 SDValue NewSrcVec =
10549 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10550 if (NewSrcVec)
10551 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10552 return SDValue();
10553 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10554 // Widen smaller SrcVec to match VT.
10555 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10556 } else
10557 return SDValue();
10558 }
10559
10560 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10561     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10562 EVT SrcVT = Idx.getValueType();
10563 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10564 uint64_t IndexScale = 0;
10565 uint64_t IndexOffset = 0;
10566
10567 // If we're scaling a smaller permute op, then we need to repeat the
10568 // indices, scaling and offsetting them as well.
10569 // e.g. v4i32 -> v16i8 (Scale = 4)
10570 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10571 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10572 for (uint64_t i = 0; i != Scale; ++i) {
10573 IndexScale |= Scale << (i * NumDstBits);
10574 IndexOffset |= i << (i * NumDstBits);
10575 }
10576
10577 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10578 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10579 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10580 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10581 return Idx;
10582 };
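  // Worked example for the lambda above (assuming Scale = 4 and NumDstBits = 8,
  // i.e. scaling v4i32 indices down to v16i8 shuffle indices): the splats are
  //   IndexScale  = 0x04040404 and IndexOffset = 0x03020100,
  // so an i32 index element holding k becomes k * 0x04040404 + 0x03020100,
  // whose four bytes (low to high) are { 4k, 4k+1, 4k+2, 4k+3 }, i.e. the byte
  // indices that reassemble the k-th 32-bit element.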
10583
10584 unsigned Opcode = 0;
10585 switch (VT.SimpleTy) {
10586 default:
10587 break;
10588 case MVT::v16i8:
10589 if (Subtarget.hasSSSE3())
10590 Opcode = X86ISD::PSHUFB;
10591 break;
10592 case MVT::v8i16:
10593 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10594 Opcode = X86ISD::VPERMV;
10595 else if (Subtarget.hasSSSE3()) {
10596 Opcode = X86ISD::PSHUFB;
10597 ShuffleVT = MVT::v16i8;
10598 }
10599 break;
10600 case MVT::v4f32:
10601 case MVT::v4i32:
10602 if (Subtarget.hasAVX()) {
10603 Opcode = X86ISD::VPERMILPV;
10604 ShuffleVT = MVT::v4f32;
10605 } else if (Subtarget.hasSSSE3()) {
10606 Opcode = X86ISD::PSHUFB;
10607 ShuffleVT = MVT::v16i8;
10608 }
10609 break;
10610 case MVT::v2f64:
10611 case MVT::v2i64:
10612 if (Subtarget.hasAVX()) {
10613 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10614 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10615 Opcode = X86ISD::VPERMILPV;
10616 ShuffleVT = MVT::v2f64;
10617 } else if (Subtarget.hasSSE41()) {
10618 // SSE41 can compare v2i64 - select between indices 0 and 1.
10619 return DAG.getSelectCC(
10620 DL, IndicesVec,
10621 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10622 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10623 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10624 ISD::CondCode::SETEQ);
10625 }
10626 break;
10627 case MVT::v32i8:
10628 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10629 Opcode = X86ISD::VPERMV;
10630 else if (Subtarget.hasXOP()) {
10631 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10632 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10633 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10634 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10635 return DAG.getNode(
10636 ISD::CONCAT_VECTORS, DL, VT,
10637 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10638 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10639 } else if (Subtarget.hasAVX()) {
10640 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10641 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10642 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10643 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10644 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10645 ArrayRef<SDValue> Ops) {
10646 // Permute Lo and Hi and then select based on index range.
10647             // This works as PSHUFB uses bits[3:0] to permute elements and we don't
10648             // care about bit[7] as it's just an index vector.
10649 SDValue Idx = Ops[2];
10650 EVT VT = Idx.getValueType();
10651 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10652 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10653 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10654 ISD::CondCode::SETGT);
10655 };
10656 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10657 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10658 PSHUFBBuilder);
10659 }
10660 break;
10661 case MVT::v16i16:
10662 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10663 Opcode = X86ISD::VPERMV;
10664 else if (Subtarget.hasAVX()) {
10665 // Scale to v32i8 and perform as v32i8.
10666 IndicesVec = ScaleIndices(IndicesVec, 2);
10667 return DAG.getBitcast(
10668 VT, createVariablePermute(
10669 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10670 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10671 }
10672 break;
10673 case MVT::v8f32:
10674 case MVT::v8i32:
10675 if (Subtarget.hasAVX2())
10676 Opcode = X86ISD::VPERMV;
10677 else if (Subtarget.hasAVX()) {
10678 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10679 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10680 {0, 1, 2, 3, 0, 1, 2, 3});
10681 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10682 {4, 5, 6, 7, 4, 5, 6, 7});
10683 if (Subtarget.hasXOP())
10684 return DAG.getBitcast(
10685 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10686 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10687 // Permute Lo and Hi and then select based on index range.
10688 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10689 SDValue Res = DAG.getSelectCC(
10690 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10691 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10692 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10693 ISD::CondCode::SETGT);
10694 return DAG.getBitcast(VT, Res);
10695 }
10696 break;
10697 case MVT::v4i64:
10698 case MVT::v4f64:
10699 if (Subtarget.hasAVX512()) {
10700 if (!Subtarget.hasVLX()) {
10701 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10702 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10703 SDLoc(SrcVec));
10704 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10705 DAG, SDLoc(IndicesVec));
10706 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10707 DAG, Subtarget);
10708 return extract256BitVector(Res, 0, DAG, DL);
10709 }
10710 Opcode = X86ISD::VPERMV;
10711 } else if (Subtarget.hasAVX()) {
10712 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10713 SDValue LoLo =
10714 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10715 SDValue HiHi =
10716 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10717 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10718 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10719 if (Subtarget.hasXOP())
10720 return DAG.getBitcast(
10721 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10722 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10723 // Permute Lo and Hi and then select based on index range.
10724 // This works as VPERMILPD only uses index bit[1] to permute elements.
10725 SDValue Res = DAG.getSelectCC(
10726 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10727 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10728 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10729 ISD::CondCode::SETGT);
10730 return DAG.getBitcast(VT, Res);
10731 }
10732 break;
10733 case MVT::v64i8:
10734 if (Subtarget.hasVBMI())
10735 Opcode = X86ISD::VPERMV;
10736 break;
10737 case MVT::v32i16:
10738 if (Subtarget.hasBWI())
10739 Opcode = X86ISD::VPERMV;
10740 break;
10741 case MVT::v16f32:
10742 case MVT::v16i32:
10743 case MVT::v8f64:
10744 case MVT::v8i64:
10745 if (Subtarget.hasAVX512())
10746 Opcode = X86ISD::VPERMV;
10747 break;
10748 }
10749 if (!Opcode)
10750 return SDValue();
10751
10752   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10753          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10754          "Illegal variable permute shuffle type");
10755
10756 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10757 if (Scale > 1)
10758 IndicesVec = ScaleIndices(IndicesVec, Scale);
10759
10760 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10761 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10762
10763 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10764 SDValue Res = Opcode == X86ISD::VPERMV
10765 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10766 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10767 return DAG.getBitcast(VT, Res);
10768}
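// A minimal scalar reference model of the v32i8 AVX fallback above (a sketch;
// the function name and C-level framing are illustrative, not from this file).
// LoLo/HiHi duplicate each 16-byte half of the source, PSHUFB consumes only
// bits [3:0] of every index byte, and the compare-select picks the high half
// whenever the index byte is greater than 15. Assumes all indices are in [0, 31].
static void referencePermuteV32i8(const unsigned char Src[32],
                                  const unsigned char Idx[32],
                                  unsigned char Dst[32]) {
  for (int i = 0; i < 32; ++i) {
    // Pick the 16-byte half by index range, then index it with the low nibble,
    // mirroring the getSelectCC + PSHUFB pair built in PSHUFBBuilder.
    const unsigned char *Half = (Idx[i] > 15) ? Src + 16 : Src;
    Dst[i] = Half[Idx[i] & 0xF];
  }
}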
10769
10770// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10771// reasoned to be a permutation of a vector by indices in a non-constant vector.
10772// (build_vector (extract_elt V, (extract_elt I, 0)),
10773// (extract_elt V, (extract_elt I, 1)),
10774// ...
10775// ->
10776// (vpermv I, V)
10777//
10778// TODO: Handle undefs
10779// TODO: Utilize pshufb and zero mask blending to support more efficient
10780// construction of vectors with constant-0 elements.
10781static SDValue
10782LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10783 const X86Subtarget &Subtarget) {
10784 SDValue SrcVec, IndicesVec;
10785 // Check for a match of the permute source vector and permute index elements.
10786 // This is done by checking that the i-th build_vector operand is of the form:
10787 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10788 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10789 SDValue Op = V.getOperand(Idx);
10790 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10791 return SDValue();
10792
10793 // If this is the first extract encountered in V, set the source vector,
10794 // otherwise verify the extract is from the previously defined source
10795 // vector.
10796 if (!SrcVec)
10797 SrcVec = Op.getOperand(0);
10798 else if (SrcVec != Op.getOperand(0))
10799 return SDValue();
10800 SDValue ExtractedIndex = Op->getOperand(1);
10801 // Peek through extends.
10802 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10803 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10804 ExtractedIndex = ExtractedIndex.getOperand(0);
10805 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10806 return SDValue();
10807
10808 // If this is the first extract from the index vector candidate, set the
10809 // indices vector, otherwise verify the extract is from the previously
10810 // defined indices vector.
10811 if (!IndicesVec)
10812 IndicesVec = ExtractedIndex.getOperand(0);
10813 else if (IndicesVec != ExtractedIndex.getOperand(0))
10814 return SDValue();
10815
10816 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10817 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10818 return SDValue();
10819 }
10820
10821 SDLoc DL(V);
10822 MVT VT = V.getSimpleValueType();
10823 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10824}
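// Worked example (a sketch) for v4i32: the loop above accepts
//   (build_vector (extract_elt V, (extract_elt I, 0)),
//                 (extract_elt V, (extract_elt I, 1)),
//                 (extract_elt V, (extract_elt I, 2)),
//                 (extract_elt V, (extract_elt I, 3)))
// only when every lane extracts from the same V and the same I, with the inner
// extract index equal to the lane number; createVariablePermute then selects a
// suitable variable shuffle (VPERMILPV for v4i32 with AVX, or a PSHUFB after
// index scaling with SSSE3).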
10825
10826SDValue
10827X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10828 SDLoc dl(Op);
10829
10830 MVT VT = Op.getSimpleValueType();
10831 MVT EltVT = VT.getVectorElementType();
10832 unsigned NumElems = Op.getNumOperands();
10833
10834 // Generate vectors for predicate vectors.
10835 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10836 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10837
10838 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10839 return VectorConstant;
10840
10841 unsigned EVTBits = EltVT.getSizeInBits();
10842 APInt UndefMask = APInt::getZero(NumElems);
10843 APInt ZeroMask = APInt::getZero(NumElems);
10844 APInt NonZeroMask = APInt::getZero(NumElems);
10845 bool IsAllConstants = true;
10846 SmallSet<SDValue, 8> Values;
10847 unsigned NumConstants = NumElems;
10848 for (unsigned i = 0; i < NumElems; ++i) {
10849 SDValue Elt = Op.getOperand(i);
10850 if (Elt.isUndef()) {
10851 UndefMask.setBit(i);
10852 continue;
10853 }
10854 Values.insert(Elt);
10855 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10856 IsAllConstants = false;
10857 NumConstants--;
10858 }
10859 if (X86::isZeroNode(Elt)) {
10860 ZeroMask.setBit(i);
10861 } else {
10862 NonZeroMask.setBit(i);
10863 }
10864 }
10865
10866 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10867 if (NonZeroMask == 0) {
10868     assert(UndefMask.isAllOnes() && "Fully undef mask expected");
10869 return DAG.getUNDEF(VT);
10870 }
10871
10872 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10873
10874 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10875 // lowering to a smaller build vector and padding with undef/zero.
10876 if ((VT.is256BitVector() || VT.is512BitVector()) &&
10877 !isFoldableUseOfShuffle(BV)) {
10878 unsigned UpperElems = NumElems / 2;
10879 APInt UndefOrZeroMask = UndefMask | ZeroMask;
10880 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10881 if (NumUpperUndefsOrZeros >= UpperElems) {
10882 if (VT.is512BitVector() &&
10883 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10884 UpperElems = NumElems - (NumElems / 4);
10885 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10886 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10887 SDValue NewBV =
10888 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10889 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10890 }
10891 }
10892
10893 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10894 return AddSub;
10895 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10896 return HorizontalOp;
10897 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10898 return Broadcast;
10899 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10900 return BitOp;
10901
10902 unsigned NumZero = ZeroMask.countPopulation();
10903 unsigned NumNonZero = NonZeroMask.countPopulation();
10904
10905 // If we are inserting one variable into a vector of non-zero constants, try
10906 // to avoid loading each constant element as a scalar. Load the constants as a
10907 // vector and then insert the variable scalar element. If insertion is not
10908 // supported, fall back to a shuffle to get the scalar blended with the
10909 // constants. Insertion into a zero vector is handled as a special-case
10910 // somewhere below here.
10911 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10912 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10913 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10914 // Create an all-constant vector. The variable element in the old
10915 // build vector is replaced by undef in the constant vector. Save the
10916 // variable scalar element and its index for use in the insertelement.
10917 LLVMContext &Context = *DAG.getContext();
10918 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10919 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10920 SDValue VarElt;
10921 SDValue InsIndex;
10922 for (unsigned i = 0; i != NumElems; ++i) {
10923 SDValue Elt = Op.getOperand(i);
10924 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10925 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10926 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10927 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10928 else if (!Elt.isUndef()) {
10929         assert(!VarElt.getNode() && !InsIndex.getNode() &&
10930                "Expected one variable element in this vector");
10931 VarElt = Elt;
10932 InsIndex = DAG.getVectorIdxConstant(i, dl);
10933 }
10934 }
10935 Constant *CV = ConstantVector::get(ConstVecOps);
10936 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10937
10938     // The constants we just created may not be legal (e.g., floating point). We
10939     // must lower the vector right here because we cannot guarantee that we'll
10940 // legalize it before loading it. This is also why we could not just create
10941 // a new build vector here. If the build vector contains illegal constants,
10942 // it could get split back up into a series of insert elements.
10943 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10944 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10945 MachineFunction &MF = DAG.getMachineFunction();
10946 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10947 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10948 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10949 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10950 if (InsertC < NumEltsInLow128Bits)
10951 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10952
10953 // There's no good way to insert into the high elements of a >128-bit
10954 // vector, so use shuffles to avoid an extract/insert sequence.
10955     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10956     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10957 SmallVector<int, 8> ShuffleMask;
10958 unsigned NumElts = VT.getVectorNumElements();
10959 for (unsigned i = 0; i != NumElts; ++i)
10960 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10961 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10962 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10963 }
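  // Worked example (a sketch): for v4f32 <1.0, 2.0, x, 4.0> with one variable
  // element x, the code above loads <1.0, 2.0, undef, 4.0> from the constant
  // pool and then inserts x at index 2 (or, for a lane above the low 128 bits
  // of a wider vector, blends it in via the shuffle mask built above).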
10964
10965   // Special case for a single non-zero, non-undef element.
10966 if (NumNonZero == 1) {
10967 unsigned Idx = NonZeroMask.countTrailingZeros();
10968 SDValue Item = Op.getOperand(Idx);
10969
10970 // If we have a constant or non-constant insertion into the low element of
10971 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10972 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10973 // depending on what the source datatype is.
10974 if (Idx == 0) {
10975 if (NumZero == 0)
10976 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10977
10978 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
10979 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
10980 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
10981         assert((VT.is128BitVector() || VT.is256BitVector() ||
10982                 VT.is512BitVector()) &&
10983                "Expected an SSE value type!");
10984 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10985 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
10986 // zero vector.
10987 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10988 }
10989
10990 // We can't directly insert an i8 or i16 into a vector, so zero extend
10991 // it to i32 first.
10992 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10993 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10994 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10995 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10996 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10997 return DAG.getBitcast(VT, Item);
10998 }
10999 }
11000
11001 // Is it a vector logical left shift?
11002 if (NumElems == 2 && Idx == 1 &&
11003 X86::isZeroNode(Op.getOperand(0)) &&
11004 !X86::isZeroNode(Op.getOperand(1))) {
11005 unsigned NumBits = VT.getSizeInBits();
11006 return getVShift(true, VT,
11007 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11008 VT, Op.getOperand(1)),
11009 NumBits/2, DAG, *this, dl);
11010 }
11011
11012 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11013 return SDValue();
11014
11015 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11016 // is a non-constant being inserted into an element other than the low one,
11017 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11018 // movd/movss) to move this into the low element, then shuffle it into
11019 // place.
11020 if (EVTBits == 32) {
11021 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11022 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11023 }
11024 }
11025
11026 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11027 if (Values.size() == 1) {
11028 if (EVTBits == 32) {
11029 // Instead of a shuffle like this:
11030 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11031 // Check if it's possible to issue this instead.
11032       // shuffle (vload ptr), undef, <1, 1, 1, 1>
11033 unsigned Idx = NonZeroMask.countTrailingZeros();
11034 SDValue Item = Op.getOperand(Idx);
11035 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11036 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11037 }
11038 return SDValue();
11039 }
11040
11041 // A vector full of immediates; various special cases are already
11042 // handled, so this is best done with a single constant-pool load.
11043 if (IsAllConstants)
11044 return SDValue();
11045
11046 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11047 return V;
11048
11049 // See if we can use a vector load to get all of the elements.
11050 {
11051 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11052 if (SDValue LD =
11053 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11054 return LD;
11055 }
11056
11057 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11058 // build_vector and broadcast it.
11059 // TODO: We could probably generalize this more.
11060 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11061 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11062 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11063 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11064 // Make sure all the even/odd operands match.
11065 for (unsigned i = 2; i != NumElems; ++i)
11066 if (Ops[i % 2] != Op.getOperand(i))
11067 return false;
11068 return true;
11069 };
11070 if (CanSplat(Op, NumElems, Ops)) {
11071 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11072 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11073 // Create a new build vector and cast to v2i64/v2f64.
11074 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11075 DAG.getBuildVector(NarrowVT, dl, Ops));
11076 // Broadcast from v2i64/v2f64 and cast to final VT.
11077 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11078 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11079 NewBV));
11080 }
11081 }
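  // Worked example (a sketch): for v8f32 (a, b, a, b, a, b, a, b) the code
  // above builds the narrow v4f32 (a, b, undef, undef), bitcasts it to v2f64,
  // broadcasts that to v4f64 with VBROADCAST, and bitcasts the result back to
  // v8f32, replacing an 8-element build_vector with a single 64-bit broadcast.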
11082
11083 // For AVX-length vectors, build the individual 128-bit pieces and use
11084 // shuffles to put them in place.
11085 if (VT.getSizeInBits() > 128) {
11086 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11087
11088 // Build both the lower and upper subvector.
11089 SDValue Lower =
11090 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11091 SDValue Upper = DAG.getBuildVector(
11092 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11093
11094 // Recreate the wider vector with the lower and upper part.
11095 return concatSubVectors(Lower, Upper, DAG, dl);
11096 }
11097
11098 // Let legalizer expand 2-wide build_vectors.
11099 if (EVTBits == 64) {
11100 if (NumNonZero == 1) {
11101 // One half is zero or undef.
11102 unsigned Idx = NonZeroMask.countTrailingZeros();
11103 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11104 Op.getOperand(Idx));
11105 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11106 }
11107 return SDValue();
11108 }
11109
11110 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11111 if (EVTBits == 8 && NumElems == 16)
11112 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11113 DAG, Subtarget))
11114 return V;
11115
11116 if (EltVT == MVT::i16 && NumElems == 8)
11117 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11118 DAG, Subtarget))
11119 return V;
11120
11121 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11122 if (EVTBits == 32 && NumElems == 4)
11123 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11124 return V;
11125
11126 // If element VT is == 32 bits, turn it into a number of shuffles.
11127 if (NumElems == 4 && NumZero > 0) {
11128 SmallVector<SDValue, 8> Ops(NumElems);
11129 for (unsigned i = 0; i < 4; ++i) {
11130 bool isZero = !NonZeroMask[i];
11131 if (isZero)
11132 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11133 else
11134 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11135 }
11136
11137 for (unsigned i = 0; i < 2; ++i) {
11138 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11139       default: llvm_unreachable("Unexpected NonZero count");
11140 case 0:
11141 Ops[i] = Ops[i*2]; // Must be a zero vector.
11142 break;
11143 case 1:
11144 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11145 break;
11146 case 2:
11147 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11148 break;
11149 case 3:
11150 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11151 break;
11152 }
11153 }
11154
11155 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11156 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11157 int MaskVec[] = {
11158 Reverse1 ? 1 : 0,
11159 Reverse1 ? 0 : 1,
11160 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11161 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11162 };
11163 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11164 }
11165
11166   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11167
11168 // Check for a build vector from mostly shuffle plus few inserting.
11169 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11170 return Sh;
11171
11172 // For SSE 4.1, use insertps to put the high elements into the low element.
11173 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11174 SDValue Result;
11175 if (!Op.getOperand(0).isUndef())
11176 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11177 else
11178 Result = DAG.getUNDEF(VT);
11179
11180 for (unsigned i = 1; i < NumElems; ++i) {
11181 if (Op.getOperand(i).isUndef()) continue;
11182 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11183 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11184 }
11185 return Result;
11186 }
11187
11188 // Otherwise, expand into a number of unpckl*, start by extending each of
11189 // our (non-undef) elements to the full vector width with the element in the
11190 // bottom slot of the vector (which generates no code for SSE).
11191 SmallVector<SDValue, 8> Ops(NumElems);
11192 for (unsigned i = 0; i < NumElems; ++i) {
11193 if (!Op.getOperand(i).isUndef())
11194 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11195 else
11196 Ops[i] = DAG.getUNDEF(VT);
11197 }
11198
11199 // Next, we iteratively mix elements, e.g. for v4f32:
11200 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11201 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11202 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11203 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11204 // Generate scaled UNPCKL shuffle mask.
11205 SmallVector<int, 16> Mask;
11206 for(unsigned i = 0; i != Scale; ++i)
11207 Mask.push_back(i);
11208 for (unsigned i = 0; i != Scale; ++i)
11209 Mask.push_back(NumElems+i);
11210 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11211
11212 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11213 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11214 }
11215 return Ops[0];
11216}
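// Worked example (a sketch) of the final unpckl expansion above for v4f32:
//   Scale = 1 builds mask {0, 4, undef, undef}:
//     X = shuffle(Ops[0], Ops[1])   (an unpcklps of elements 0 and 1)
//     Y = shuffle(Ops[2], Ops[3])   (an unpcklps of elements 2 and 3)
//   Scale = 2 builds mask {0, 1, 4, 5}:
//     result = shuffle(X, Y)        (an unpcklpd), matching the <3, 2, 1, 0>
//     result shown in the step comments above.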
11217
11218// 256-bit AVX can use the vinsertf128 instruction
11219// to create 256-bit vectors from two other 128-bit ones.
11220// TODO: Detect subvector broadcast here instead of DAG combine?
11221static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11222 const X86Subtarget &Subtarget) {
11223 SDLoc dl(Op);
11224 MVT ResVT = Op.getSimpleValueType();
11225
11226   assert((ResVT.is256BitVector() ||
11227           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11228
11229 unsigned NumOperands = Op.getNumOperands();
11230 unsigned NumZero = 0;
11231 unsigned NumNonZero = 0;
11232 unsigned NonZeros = 0;
11233 for (unsigned i = 0; i != NumOperands; ++i) {
11234 SDValue SubVec = Op.getOperand(i);
11235 if (SubVec.isUndef())
11236 continue;
11237 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11238 ++NumZero;
11239 else {
11240       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11241 NonZeros |= 1 << i;
11242 ++NumNonZero;
11243 }
11244 }
11245
11246 // If we have more than 2 non-zeros, build each half separately.
11247 if (NumNonZero > 2) {
11248 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11249 ArrayRef<SDUse> Ops = Op->ops();
11250 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11251 Ops.slice(0, NumOperands/2));
11252 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11253 Ops.slice(NumOperands/2));
11254 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11255 }
11256
11257 // Otherwise, build it up through insert_subvectors.
11258 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11259 : DAG.getUNDEF(ResVT);
11260
11261 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11262 unsigned NumSubElems = SubVT.getVectorNumElements();
11263 for (unsigned i = 0; i != NumOperands; ++i) {
11264 if ((NonZeros & (1 << i)) == 0)
11265 continue;
11266
11267 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11268 Op.getOperand(i),
11269 DAG.getIntPtrConstant(i * NumSubElems, dl));
11270 }
11271
11272 return Vec;
11273}
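// Worked example (a sketch): concat_vectors (zeros, X, zeros, Y) of four v4i32
// subvectors into v16i32 has two non-zero operands, so the code above starts
// from getZeroVector(v16i32) and emits two INSERT_SUBVECTORs at element
// offsets 4 (X) and 12 (Y); with more than two non-zero operands it would
// instead recurse on each 256-bit half.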
11274
11275 // Lower a CONCAT_VECTORS of vXi1 vectors: insert the non-zero/non-undef
11276 // subvectors into a zero or undef k-register, preferring a single KSHIFTL
11277 // where the generic insert_subvector lowering would need two kshifts.
11278// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11279static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11280 const X86Subtarget &Subtarget,
11281 SelectionDAG & DAG) {
11282 SDLoc dl(Op);
11283 MVT ResVT = Op.getSimpleValueType();
11284 unsigned NumOperands = Op.getNumOperands();
11285
11286   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11287          "Unexpected number of operands in CONCAT_VECTORS");
11288
11289 uint64_t Zeros = 0;
11290 uint64_t NonZeros = 0;
11291 for (unsigned i = 0; i != NumOperands; ++i) {
11292 SDValue SubVec = Op.getOperand(i);
11293 if (SubVec.isUndef())
11294 continue;
11295     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11296 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11297 Zeros |= (uint64_t)1 << i;
11298 else
11299 NonZeros |= (uint64_t)1 << i;
11300 }
11301
11302 unsigned NumElems = ResVT.getVectorNumElements();
11303
11304   // If we are inserting a non-zero vector and there are zeros in the LSBs and
11305   // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11306   // insert_subvector would give us two kshifts.
11307 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11308 Log2_64(NonZeros) != NumOperands - 1) {
11309 MVT ShiftVT = ResVT;
11310 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11311 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11312 unsigned Idx = Log2_64(NonZeros);
11313 SDValue SubVec = Op.getOperand(Idx);
11314 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11315 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11316 DAG.getUNDEF(ShiftVT), SubVec,
11317 DAG.getIntPtrConstant(0, dl));
11318 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11319 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11320 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11321 DAG.getIntPtrConstant(0, dl));
11322 }
11323
11324 // If there are zero or one non-zeros we can handle this very simply.
11325 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11326 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11327 if (!NonZeros)
11328 return Vec;
11329 unsigned Idx = Log2_64(NonZeros);
11330 SDValue SubVec = Op.getOperand(Idx);
11331 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11332 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11333 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11334 }
11335
11336 if (NumOperands > 2) {
11337 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11338 ArrayRef<SDUse> Ops = Op->ops();
11339 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11340 Ops.slice(0, NumOperands/2));
11341 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11342 Ops.slice(NumOperands/2));
11343 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11344 }
11345
11346   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
11347
11348 if (ResVT.getVectorNumElements() >= 16)
11349 return Op; // The operation is legal with KUNPCK
11350
11351 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11352 DAG.getUNDEF(ResVT), Op.getOperand(0),
11353 DAG.getIntPtrConstant(0, dl));
11354 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11355 DAG.getIntPtrConstant(NumElems/2, dl));
11356}
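// Worked example (a sketch) of the KSHIFTL path above: for
//   v8i1 concat_vectors (zeroinitializer, X, undef, undef)   ; X is v2i1
// NonZeros = 0b0010 and Zeros = 0b0001, so X is inserted at element 0 of a
// wider k-register, shifted left by Idx * SubVecNumElts = 2 with KSHIFTL
// (which shifts zeros in below it), and the v8i1 result is extracted from
// element 0, using one kshift where the generic lowering would emit two.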
11357
11358static SDValue LowerCONCAT_VECTORS(SDValue Op,
11359 const X86Subtarget &Subtarget,
11360 SelectionDAG &DAG) {
11361 MVT VT = Op.getSimpleValueType();
11362 if (VT.getVectorElementType() == MVT::i1)
11363 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11364
11365   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11366          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11367                                   Op.getNumOperands() == 4)));
11368
11369 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11370 // from two other 128-bit ones.
11371
11372 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11373 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11374}
11375
11376//===----------------------------------------------------------------------===//
11377// Vector shuffle lowering
11378//
11379// This is an experimental code path for lowering vector shuffles on x86. It is
11380// designed to handle arbitrary vector shuffles and blends, gracefully
11381// degrading performance as necessary. It works hard to recognize idiomatic
11382// shuffles and lower them to optimal instruction patterns without leaving
11383// a framework that allows reasonably efficient handling of all vector shuffle
11384// patterns.
11385//===----------------------------------------------------------------------===//
11386
11387/// Tiny helper function to identify a no-op mask.
11388///
11389/// This is a somewhat boring predicate function. It checks whether the mask
11390/// array input, which is assumed to be a single-input shuffle mask of the kind
11391/// used by the X86 shuffle instructions (not a fully general
11392 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
11393 /// in-place shuffle are no-ops.
11394static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11395 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11396     assert(Mask[i] >= -1 && "Out of bound mask element!");
11397 if (Mask[i] >= 0 && Mask[i] != i)
11398 return false;
11399 }
11400 return true;
11401}
11402
11403/// Test whether there are elements crossing LaneSizeInBits lanes in this
11404/// shuffle mask.
11405///
11406/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11407/// and we routinely test for these.
11408static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11409 unsigned ScalarSizeInBits,
11410 ArrayRef<int> Mask) {
11411   assert(LaneSizeInBits && ScalarSizeInBits &&
11412          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11413          "Illegal shuffle lane size");
11414 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11415 int Size = Mask.size();
11416 for (int i = 0; i < Size; ++i)
11417 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11418 return true;
11419 return false;
11420}
11421
11422/// Test whether there are elements crossing 128-bit lanes in this
11423/// shuffle mask.
11424static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11425 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11426}
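// Example (a sketch): for v8f32 (two 128-bit lanes of four elements), the mask
// {1, 0, 3, 2, 5, 4, 7, 6} stays within its lanes and is not lane-crossing,
// while {4, 5, 6, 7, 0, 1, 2, 3} moves every element into the other lane and
// therefore is.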
11427
11428/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11429/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11430/// better support 'repeated mask + lane permute' style shuffles.
11431static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11432 unsigned ScalarSizeInBits,
11433 ArrayRef<int> Mask) {
11434   assert(LaneSizeInBits && ScalarSizeInBits &&
11435          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11436          "Illegal shuffle lane size");
11437 int NumElts = Mask.size();
11438 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11439 int NumLanes = NumElts / NumEltsPerLane;
11440 if (NumLanes > 1) {
11441 for (int i = 0; i != NumLanes; ++i) {
11442 int SrcLane = -1;
11443 for (int j = 0; j != NumEltsPerLane; ++j) {
11444 int M = Mask[(i * NumEltsPerLane) + j];
11445 if (M < 0)
11446 continue;
11447 int Lane = (M % NumElts) / NumEltsPerLane;
11448 if (SrcLane >= 0 && SrcLane != Lane)
11449 return true;
11450 SrcLane = Lane;
11451 }
11452 }
11453 }
11454 return false;
11455}
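// Example (a sketch) of the distinction drawn above: for v8f32, the mask
// {4, 5, 6, 7, 0, 1, 2, 3} is lane-crossing but NOT multi-lane (each 128-bit
// result lane reads from a single source lane, which suits a repeated mask
// plus lane permute), whereas {0, 4, 1, 5, 2, 6, 3, 7} mixes both source
// lanes inside each result lane and IS multi-lane.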
11456
11457/// Test whether a shuffle mask is equivalent within each sub-lane.
11458///
11459/// This checks a shuffle mask to see if it is performing the same
11460/// lane-relative shuffle in each sub-lane. This trivially implies
11461/// that it is also not lane-crossing. It may however involve a blend from the
11462/// same lane of a second vector.
11463///
11464/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11465/// non-trivial to compute in the face of undef lanes. The representation is
11466/// suitable for use with existing 128-bit shuffles as entries from the second
11467/// vector have been remapped to [LaneSize, 2*LaneSize).
11468static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11469 ArrayRef<int> Mask,
11470 SmallVectorImpl<int> &RepeatedMask) {
11471 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11472 RepeatedMask.assign(LaneSize, -1);
11473 int Size = Mask.size();
11474 for (int i = 0; i < Size; ++i) {
11475     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11476 if (Mask[i] < 0)
11477 continue;
11478 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11479 // This entry crosses lanes, so there is no way to model this shuffle.
11480 return false;
11481
11482 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11483 // Adjust second vector indices to start at LaneSize instead of Size.
11484 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11485 : Mask[i] % LaneSize + LaneSize;
11486 if (RepeatedMask[i % LaneSize] < 0)
11487 // This is the first non-undef entry in this slot of a 128-bit lane.
11488 RepeatedMask[i % LaneSize] = LocalM;
11489 else if (RepeatedMask[i % LaneSize] != LocalM)
11490 // Found a mismatch with the repeated mask.
11491 return false;
11492 }
11493 return true;
11494}
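// Example (a sketch): for v8i32 with 128-bit lanes, the mask
// {1, 0, 3, 2, 5, 4, 7, 6} repeats as RepeatedMask = {1, 0, 3, 2}, and a
// two-input mask such as {0, 8, 1, 9, 4, 12, 5, 13} repeats as {0, 4, 1, 5}
// with the second vector's entries remapped into [LaneSize, 2*LaneSize).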
11495
11496/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11497static bool
11498is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11499 SmallVectorImpl<int> &RepeatedMask) {
11500 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11501}
11502
11503static bool
11504is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11505 SmallVector<int, 32> RepeatedMask;
11506 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11507}
11508
11509/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11510static bool
11511is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11512 SmallVectorImpl<int> &RepeatedMask) {
11513 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11514}
11515
11516/// Test whether a target shuffle mask is equivalent within each sub-lane.
11517/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11518static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11519 unsigned EltSizeInBits,
11520 ArrayRef<int> Mask,
11521 SmallVectorImpl<int> &RepeatedMask) {
11522 int LaneSize = LaneSizeInBits / EltSizeInBits;
11523 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11524 int Size = Mask.size();
11525 for (int i = 0; i < Size; ++i) {
11526     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11527 if (Mask[i] == SM_SentinelUndef)
11528 continue;
11529 if (Mask[i] == SM_SentinelZero) {
11530 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11531 return false;
11532 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11533 continue;
11534 }
11535 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11536 // This entry crosses lanes, so there is no way to model this shuffle.
11537 return false;
11538
11539 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11540 // later vector indices to start at multiples of LaneSize instead of Size.
11541 int LaneM = Mask[i] / Size;
11542 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11543 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11544 // This is the first non-undef entry in this slot of a 128-bit lane.
11545 RepeatedMask[i % LaneSize] = LocalM;
11546 else if (RepeatedMask[i % LaneSize] != LocalM)
11547 // Found a mismatch with the repeated mask.
11548 return false;
11549 }
11550 return true;
11551}
11552
11553/// Test whether a target shuffle mask is equivalent within each sub-lane.
11554/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11555static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11556 ArrayRef<int> Mask,
11557 SmallVectorImpl<int> &RepeatedMask) {
11558 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11559 Mask, RepeatedMask);
11560}
11561
11562/// Checks whether the vector elements referenced by two shuffle masks are
11563/// equivalent.
11564static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11565 int Idx, int ExpectedIdx) {
11566 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11567 ExpectedIdx < MaskSize && "Out of range element index");
11568 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11569 return false;
11570
11571 switch (Op.getOpcode()) {
11572 case ISD::BUILD_VECTOR:
11573 // If the values are build vectors, we can look through them to find
11574 // equivalent inputs that make the shuffles equivalent.
11575 // TODO: Handle MaskSize != Op.getNumOperands()?
11576 if (MaskSize == (int)Op.getNumOperands() &&
11577 MaskSize == (int)ExpectedOp.getNumOperands())
11578 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11579 break;
11580 case X86ISD::VBROADCAST:
11581 case X86ISD::VBROADCAST_LOAD:
11582 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11583 return (Op == ExpectedOp &&
11584 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11585 case X86ISD::HADD:
11586 case X86ISD::HSUB:
11587 case X86ISD::FHADD:
11588 case X86ISD::FHSUB:
11589 case X86ISD::PACKSS:
11590 case X86ISD::PACKUS:
11591 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11592 // TODO: Handle MaskSize != NumElts?
11593 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11594 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11595 MVT VT = Op.getSimpleValueType();
11596 int NumElts = VT.getVectorNumElements();
11597 if (MaskSize == NumElts) {
11598 int NumLanes = VT.getSizeInBits() / 128;
11599 int NumEltsPerLane = NumElts / NumLanes;
11600 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11601 bool SameLane =
11602 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11603 bool SameElt =
11604 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11605 return SameLane && SameElt;
11606 }
11607 }
11608 break;
11609 }
11610
11611 return false;
11612}
11613
11614/// Checks whether a shuffle mask is equivalent to an explicit list of
11615/// arguments.
11616///
11617/// This is a fast way to test a shuffle mask against a fixed pattern:
11618///
11619/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11620///
11621/// It returns true if the mask is exactly as wide as the argument list, and
11622/// each element of the mask is either -1 (signifying undef) or the value given
11623/// in the argument.
11624static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11625 SDValue V1 = SDValue(),
11626 SDValue V2 = SDValue()) {
11627 int Size = Mask.size();
11628 if (Size != (int)ExpectedMask.size())
11629 return false;
11630
11631 for (int i = 0; i < Size; ++i) {
11632 assert(Mask[i] >= -1 && "Out of bound mask element!");
11633 int MaskIdx = Mask[i];
11634 int ExpectedIdx = ExpectedMask[i];
11635 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11636 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11637 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11638 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11639 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11640 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11641 return false;
11642 }
11643 }
11644 return true;
11645}
11646
11647/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11648///
11649/// The masks must be exactly the same width.
11650///
11651/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11652/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11653///
11654/// SM_SentinelZero is accepted as a valid negative index but must match in
11655/// both.
11656static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11657 ArrayRef<int> ExpectedMask,
11658 SDValue V1 = SDValue(),
11659 SDValue V2 = SDValue()) {
11660 int Size = Mask.size();
11661 if (Size != (int)ExpectedMask.size())
11662 return false;
11663 assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11664 "Illegal target shuffle mask");
11665
11666 // Check for out-of-range target shuffle mask indices.
11667 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11668 return false;
11669
11670 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11671 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11672 V1 = SDValue();
11673 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11674 V2 = SDValue();
11675
11676 for (int i = 0; i < Size; ++i) {
11677 int MaskIdx = Mask[i];
11678 int ExpectedIdx = ExpectedMask[i];
11679 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11680 continue;
11681 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11682 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11683 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11684 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11685 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11686 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11687 continue;
11688 }
11689 // TODO - handle SM_Sentinel equivalences.
11690 return false;
11691 }
11692 return true;
11693}
11694
11695// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
11696static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11697 SDValue Cond, bool IsBLENDV = false) {
11698 EVT CondVT = Cond.getValueType();
11699 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11700 unsigned NumElts = CondVT.getVectorNumElements();
11701
11702 APInt UndefElts;
11703 SmallVector<APInt, 32> EltBits;
11704 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11705 true, false))
11706 return false;
11707
11708 Mask.resize(NumElts, SM_SentinelUndef);
11709
11710 for (int i = 0; i != (int)NumElts; ++i) {
11711 Mask[i] = i;
11712 // Arbitrarily choose from the 2nd operand if the select condition element
11713 // is undef.
11714 // TODO: Can we do better by matching patterns such as even/odd?
11715 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
11716 (IsBLENDV && EltBits[i].isNonNegative()))
11717 Mask[i] += NumElts;
11718 }
11719
11720 return true;
11721}
11722
11723// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11724// instructions.
11725static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11726 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11727 return false;
11728
11729 SmallVector<int, 8> Unpcklwd;
11730 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11731 /* Unary = */ false);
11732 SmallVector<int, 8> Unpckhwd;
11733 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11734 /* Unary = */ false);
11735 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11736 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11737 return IsUnpackwdMask;
11738}
11739
11740static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11741 // Create 128-bit vector type based on mask size.
11742 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11743 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11744
11745 // We can't assume a canonical shuffle mask, so try the commuted version too.
11746 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11747 ShuffleVectorSDNode::commuteMask(CommutedMask);
11748
11749 // Match any of unary/binary or low/high.
11750 for (unsigned i = 0; i != 4; ++i) {
11751 SmallVector<int, 16> UnpackMask;
11752 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11753 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11754 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11755 return true;
11756 }
11757 return false;
11758}
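
For reference, these are the interleaving patterns the two helpers above compare against; a standalone sketch for a single 128-bit lane (createUnpackShuffleMask generalises this per lane, with second-operand indices offset by the element count):

#include <vector>

// Low/high unpack mask for a single 128-bit lane of NumElts elements.
static std::vector<int> unpack128Mask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);                         // element from V1
    Mask.push_back(Base + i + (Unary ? 0 : NumElts)); // from V2 (or V1 if unary)
  }
  return Mask;
}

// unpack128Mask(8, /*Lo=*/true,  /*Unary=*/false) -> {0, 8, 1, 9, 2, 10, 3, 11}  (PUNPCKLWD)
// unpack128Mask(8, /*Lo=*/false, /*Unary=*/false) -> {4, 12, 5, 13, 6, 14, 7, 15} (PUNPCKHWD)
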
11759
11760/// Return true if a shuffle mask chooses elements identically in its top and
11761/// bottom halves. For example, any splat mask has the same top and bottom
11762/// halves. If an element is undefined in only one half of the mask, the halves
11763/// are not considered identical.
11764static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11765 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11766 unsigned HalfSize = Mask.size() / 2;
11767 for (unsigned i = 0; i != HalfSize; ++i) {
11768 if (Mask[i] != Mask[i + HalfSize])
11769 return false;
11770 }
11771 return true;
11772}
11773
11774/// Get a 4-lane 8-bit shuffle immediate for a mask.
11775///
11776/// This helper function produces an 8-bit shuffle immediate corresponding to
11777/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11778/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11779/// example.
11780///
11781/// NB: We rely heavily on "undef" masks preserving the input lane.
11782static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11783 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11784 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11785 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11786 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11787 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11788
11789 // If the mask only uses one non-undef element, then fully 'splat' it to
11790 // improve later broadcast matching.
11791 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11792 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11793
11794 int FirstElt = Mask[FirstIndex];
11795 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11796 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11797
11798 unsigned Imm = 0;
11799 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11800 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11801 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11802 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11803 return Imm;
11804}
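
A compile-time sketch of the immediate encoding computed above (without the single-element splat special case; undef lanes simply keep their identity index):

// Two bits per destination lane, least-significant lane first.
constexpr unsigned v4ShuffleImm(int M0, int M1, int M2, int M3) {
  return ((M0 < 0 ? 0 : M0) << 0) | ((M1 < 0 ? 1 : M1) << 2) |
         ((M2 < 0 ? 2 : M2) << 4) | ((M3 < 0 ? 3 : M3) << 6);
}
static_assert(v4ShuffleImm(3, 2, 1, 0) == 0x1B, "reverse mask -> 0b00011011");
static_assert(v4ShuffleImm(0, -1, 2, -1) == 0xE4, "undef lanes 1 and 3 keep their index");
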
11805
11806static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11807 SelectionDAG &DAG) {
11808 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11809}
11810
11811// The shuffle result has the form:
11812// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
11813// Each element of Zeroable corresponds to a particular element of Mask, as
11814// described in the computeZeroableShuffleElements function.
11815//
11816// The function looks for a sub-mask whose nonzero elements are in increasing
11817// order. If such a sub-mask exists, the function returns true.
11818static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11819 ArrayRef<int> Mask, const EVT &VectorType,
11820 bool &IsZeroSideLeft) {
11821 int NextElement = -1;
11822 // Check if the Mask's nonzero elements are in increasing order.
11823 for (int i = 0, e = Mask.size(); i < e; i++) {
11824 // Checks if the mask's zeros elements are built from only zeros.
11825 assert(Mask[i] >= -1 && "Out of bound mask element!");
11826 if (Mask[i] < 0)
11827 return false;
11828 if (Zeroable[i])
11829 continue;
11830 // Find the lowest non zero element
11831 if (NextElement < 0) {
11832 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11833 IsZeroSideLeft = NextElement != 0;
11834 }
11835 // Exit if the mask's non zero elements are not in increasing order.
11836 if (NextElement != Mask[i])
11837 return false;
11838 NextElement++;
11839 }
11840 return true;
11841}
11842
11843/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11844static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11845 ArrayRef<int> Mask, SDValue V1,
11846 SDValue V2, const APInt &Zeroable,
11847 const X86Subtarget &Subtarget,
11848 SelectionDAG &DAG) {
11849 int Size = Mask.size();
11850 int LaneSize = 128 / VT.getScalarSizeInBits();
11851 const int NumBytes = VT.getSizeInBits() / 8;
11852 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11853
11854 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11855 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11856 (Subtarget.hasBWI() && VT.is512BitVector()));
11857
11858 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11859 // Sign bit set in i8 mask means zero element.
11860 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11861
11862 SDValue V;
11863 for (int i = 0; i < NumBytes; ++i) {
11864 int M = Mask[i / NumEltBytes];
11865 if (M < 0) {
11866 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11867 continue;
11868 }
11869 if (Zeroable[i / NumEltBytes]) {
11870 PSHUFBMask[i] = ZeroMask;
11871 continue;
11872 }
11873
11874 // We can only use a single input of V1 or V2.
11875 SDValue SrcV = (M >= Size ? V2 : V1);
11876 if (V && V != SrcV)
11877 return SDValue();
11878 V = SrcV;
11879 M %= Size;
11880
11881 // PSHUFB can't cross lanes, ensure this doesn't happen.
11882 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11883 return SDValue();
11884
11885 M = M % LaneSize;
11886 M = M * NumEltBytes + (i % NumEltBytes);
11887 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11888 }
11889 assert(V && "Failed to find a source input");
11890
11891 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11892 return DAG.getBitcast(
11893 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11894 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11895}
11896
11897static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11898 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11899 const SDLoc &dl);
11900
11901// X86 has a dedicated shuffle that can be lowered to VEXPAND
11902static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11903 const APInt &Zeroable,
11904 ArrayRef<int> Mask, SDValue &V1,
11905 SDValue &V2, SelectionDAG &DAG,
11906 const X86Subtarget &Subtarget) {
11907 bool IsLeftZeroSide = true;
11908 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11909 IsLeftZeroSide))
11910 return SDValue();
11911 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11912 MVT IntegerType =
11913 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11914 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11915 unsigned NumElts = VT.getVectorNumElements();
11916 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11917 "Unexpected number of vector elements");
11918 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11919 Subtarget, DAG, DL);
11920 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11921 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11922 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11923}
11924
11925static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11926 unsigned &UnpackOpcode, bool IsUnary,
11927 ArrayRef<int> TargetMask, const SDLoc &DL,
11928 SelectionDAG &DAG,
11929 const X86Subtarget &Subtarget) {
11930 int NumElts = VT.getVectorNumElements();
11931
11932 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11933 for (int i = 0; i != NumElts; i += 2) {
11934 int M1 = TargetMask[i + 0];
11935 int M2 = TargetMask[i + 1];
11936 Undef1 &= (SM_SentinelUndef == M1);
11937 Undef2 &= (SM_SentinelUndef == M2);
11938 Zero1 &= isUndefOrZero(M1);
11939 Zero2 &= isUndefOrZero(M2);
11940 }
11941 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11942 "Zeroable shuffle detected");
11943
11944 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11945 SmallVector<int, 64> Unpckl, Unpckh;
11946 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11947 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11948 (IsUnary ? V1 : V2))) {
11949 UnpackOpcode = X86ISD::UNPCKL;
11950 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11951 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11952 return true;
11953 }
11954
11955 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11956 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11957 (IsUnary ? V1 : V2))) {
11958 UnpackOpcode = X86ISD::UNPCKH;
11959 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11960 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11961 return true;
11962 }
11963
11964 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11965 if (IsUnary && (Zero1 || Zero2)) {
11966 // Don't bother if we can blend instead.
11967 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11968 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11969 return false;
11970
11971 bool MatchLo = true, MatchHi = true;
11972 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11973 int M = TargetMask[i];
11974
11975 // Ignore if the input is known to be zero or the index is undef.
11976 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11977 (M == SM_SentinelUndef))
11978 continue;
11979
11980 MatchLo &= (M == Unpckl[i]);
11981 MatchHi &= (M == Unpckh[i]);
11982 }
11983
11984 if (MatchLo || MatchHi) {
11985 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11986 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11987 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11988 return true;
11989 }
11990 }
11991
11992 // If a binary shuffle, commute and try again.
11993 if (!IsUnary) {
11994 ShuffleVectorSDNode::commuteMask(Unpckl);
11995 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11996 UnpackOpcode = X86ISD::UNPCKL;
11997 std::swap(V1, V2);
11998 return true;
11999 }
12000
12001 ShuffleVectorSDNode::commuteMask(Unpckh);
12002 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
12003 UnpackOpcode = X86ISD::UNPCKH;
12004 std::swap(V1, V2);
12005 return true;
12006 }
12007 }
12008
12009 return false;
12010}
12011
12012// X86 has dedicated unpack instructions that can handle specific blend
12013// operations: UNPCKH and UNPCKL.
12014static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12015 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12016 SelectionDAG &DAG) {
12017 SmallVector<int, 8> Unpckl;
12018 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12019 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12020 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12021
12022 SmallVector<int, 8> Unpckh;
12023 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12024 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12025 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12026
12027 // Commute and try again.
12028 ShuffleVectorSDNode::commuteMask(Unpckl);
12029 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12030 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12031
12032 ShuffleVectorSDNode::commuteMask(Unpckh);
12033 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12034 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12035
12036 return SDValue();
12037}
12038
12039/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12040/// followed by unpack 256-bit.
12041static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12042 ArrayRef<int> Mask, SDValue V1,
12043 SDValue V2, SelectionDAG &DAG) {
12044 SmallVector<int, 32> Unpckl, Unpckh;
12045 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12046 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12047
12048 unsigned UnpackOpcode;
12049 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12050 UnpackOpcode = X86ISD::UNPCKL;
12051 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12052 UnpackOpcode = X86ISD::UNPCKH;
12053 else
12054 return SDValue();
12055
12056 // This is a "natural" unpack operation (rather than the 128-bit sectored
12057 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12058 // input in order to use the x86 instruction.
12059 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12060 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12061 V1 = DAG.getBitcast(VT, V1);
12062 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12063}
12064
12065// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12066// source into the lower elements and zeroing the upper elements.
12067static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12068 ArrayRef<int> Mask, const APInt &Zeroable,
12069 const X86Subtarget &Subtarget) {
12070 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12071 return false;
12072
12073 unsigned NumElts = Mask.size();
12074 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12075 unsigned MaxScale = 64 / EltSizeInBits;
12076
12077 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12078 unsigned SrcEltBits = EltSizeInBits * Scale;
12079 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12080 continue;
12081 unsigned NumSrcElts = NumElts / Scale;
12082 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12083 continue;
12084 unsigned UpperElts = NumElts - NumSrcElts;
12085 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12086 continue;
12087 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12088 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12089 DstVT = MVT::getIntegerVT(EltSizeInBits);
12090 if ((NumSrcElts * EltSizeInBits) >= 128) {
12091 // ISD::TRUNCATE
12092 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12093 } else {
12094 // X86ISD::VTRUNC
12095 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12096 }
12097 return true;
12098 }
12099
12100 return false;
12101}
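
To make the accepted mask shape concrete, here is a standalone compile-time check specialised to VT = v16i8 and Scale = 2 (purely illustrative; ZeroableBits stands in for the Zeroable APInt):

// Low 8 elements must step by 2 (undef allowed); the upper 8 must be zeroable.
constexpr bool isVTRUNCMaskScale2(const int (&Mask)[16], unsigned ZeroableBits) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != 2 * i)
      return false;
  for (int i = 8; i != 16; ++i)
    if (!(ZeroableBits & (1u << i)))
      return false;
  return true;
}
constexpr int TruncMask[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                               -1, -1, -1, -1, -1, -1, -1, -1};
static_assert(isVTRUNCMaskScale2(TruncMask, 0xFF00u),
              "a v8i16 -> v8i8 truncate into the low half, upper half zeroed");
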
12102
12103// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12104// element padding to the final DstVT.
12105static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12106 const X86Subtarget &Subtarget,
12107 SelectionDAG &DAG, bool ZeroUppers) {
12108 MVT SrcVT = Src.getSimpleValueType();
12109 MVT DstSVT = DstVT.getScalarType();
12110 unsigned NumDstElts = DstVT.getVectorNumElements();
12111 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12112 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12113
12114 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12115 return SDValue();
12116
12117 // Perform a direct ISD::TRUNCATE if possible.
12118 if (NumSrcElts == NumDstElts)
12119 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12120
12121 if (NumSrcElts > NumDstElts) {
12122 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12123 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12124 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12125 }
12126
12127 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12128 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12129 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12130 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12131 DstVT.getSizeInBits());
12132 }
12133
12134 // Non-VLX targets must truncate from a 512-bit type, so we need to
12135 // widen, truncate and then possibly extract the original subvector.
12136 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12137 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12138 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12139 }
12140
12141 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12142 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12143 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12144 if (DstVT != TruncVT)
12145 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12146 DstVT.getSizeInBits());
12147 return Trunc;
12148}
12149
12150// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12151//
12152// An example is the following:
12153//
12154// t0: ch = EntryToken
12155// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12156// t25: v4i32 = truncate t2
12157// t41: v8i16 = bitcast t25
12158// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12159// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12160// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12161// t18: v2i64 = bitcast t51
12162//
12163// One can just use a single vpmovdw instruction; without avx512vl we need to
12164// use the zmm variant and extract the lower subvector, padding with zeroes.
12165// TODO: Merge with lowerShuffleAsVTRUNC.
12166static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12167 SDValue V2, ArrayRef<int> Mask,
12168 const APInt &Zeroable,
12169 const X86Subtarget &Subtarget,
12170 SelectionDAG &DAG) {
12171 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12172 if (!Subtarget.hasAVX512())
12173 return SDValue();
12174
12175 unsigned NumElts = VT.getVectorNumElements();
12176 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12177 unsigned MaxScale = 64 / EltSizeInBits;
12178 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12179 unsigned NumSrcElts = NumElts / Scale;
12180 unsigned UpperElts = NumElts - NumSrcElts;
12181 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12182 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12183 continue;
12184
12185 SDValue Src = V1;
12186 if (!Src.hasOneUse())
12187 return SDValue();
12188
12189 Src = peekThroughOneUseBitcasts(Src);
12190 if (Src.getOpcode() != ISD::TRUNCATE ||
12191 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
12192 return SDValue();
12193 Src = Src.getOperand(0);
12194
12195 // VPMOVWB is only available with avx512bw.
12196 MVT SrcVT = Src.getSimpleValueType();
12197 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
12198 !Subtarget.hasBWI())
12199 return SDValue();
12200
12201 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12202 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12203 }
12204
12205 return SDValue();
12206}
12207
12208// Attempt to match binary shuffle patterns as a truncate.
12209static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12210 SDValue V2, ArrayRef<int> Mask,
12211 const APInt &Zeroable,
12212 const X86Subtarget &Subtarget,
12213 SelectionDAG &DAG) {
12214 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12215 "Unexpected VTRUNC type");
12216 if (!Subtarget.hasAVX512())
12217 return SDValue();
12218
12219 unsigned NumElts = VT.getVectorNumElements();
12220 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12221 unsigned MaxScale = 64 / EltSizeInBits;
12222 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12223 // TODO: Support non-BWI VPMOVWB truncations?
12224 unsigned SrcEltBits = EltSizeInBits * Scale;
12225 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12226 continue;
12227
12228 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
12229 // Bail if the V2 elements are undef.
12230 unsigned NumHalfSrcElts = NumElts / Scale;
12231 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12232 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12233 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12234 continue;
12235
12236 // The elements beyond the truncation must be undef/zero.
12237 unsigned UpperElts = NumElts - NumSrcElts;
12238 if (UpperElts > 0 &&
12239 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12240 continue;
12241 bool UndefUppers =
12242 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12243
12244 // As we're using both sources we need to concat them together
12245 // and truncate from the double-sized src.
12246 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12247 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12248
12249 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12250 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12251 Src = DAG.getBitcast(SrcVT, Src);
12252 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12253 }
12254
12255 return SDValue();
12256}
12257
12258/// Check whether a compaction lowering can be done by dropping even/odd
12259/// elements and compute how many times even/odd elements must be dropped.
12260///
12261/// This handles shuffles which take every Nth element where N is a power of
12262/// two. Example shuffle masks:
12263///
12264/// (even)
12265/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12266/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12267/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12268/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12269/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12270/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12271///
12272/// (odd)
12273/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12274/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12275///
12276/// Any of these lanes can of course be undef.
12277///
12278/// This routine only supports N <= 3.
12279/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12280/// for larger N.
12281///
12282/// \returns N above, or the number of times even/odd elements must be dropped
12283/// if there is such a number. Otherwise returns zero.
12284static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12285 bool IsSingleInput) {
12286 // The modulus for the shuffle vector entries is based on whether this is
12287 // a single input or not.
12288 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12289 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12290 "We should only be called with masks with a power-of-2 size!");
12291
12292 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12293 int Offset = MatchEven ? 0 : 1;
12294
12295 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12296 // and 2^3 simultaneously. This is because we may have ambiguity with
12297 // partially undef inputs.
12298 bool ViableForN[3] = {true, true, true};
12299
12300 for (int i = 0, e = Mask.size(); i < e; ++i) {
12301 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12302 // want.
12303 if (Mask[i] < 0)
12304 continue;
12305
12306 bool IsAnyViable = false;
12307 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12308 if (ViableForN[j]) {
12309 uint64_t N = j + 1;
12310
12311 // The shuffle mask must be equal to (i * 2^N) % M.
12312 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12313 IsAnyViable = true;
12314 else
12315 ViableForN[j] = false;
12316 }
12317 // Early exit if we exhaust the possible powers of two.
12318 if (!IsAnyViable)
12319 break;
12320 }
12321
12322 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12323 if (ViableForN[j])
12324 return j + 1;
12325
12326 // Return 0 as there is no viable power of two.
12327 return 0;
12328}
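
A standalone sketch of the viability test above, specialised to a single-input v16i8 mask (so ShuffleModulus is 16 and ModMask is 15):

constexpr bool viableForN(const int (&Mask)[16], int N, int Offset) {
  for (int i = 0; i != 16; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes are optimistically accepted
    if (Mask[i] - Offset != ((i << N) & 15))
      return false;
  }
  return true;
}
constexpr int EveryOtherByte[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                                    0, 2, 4, 6, 8, 10, 12, 14};
static_assert(viableForN(EveryOtherByte, 1, /*Offset=*/0), "even elements, N = 1");
static_assert(!viableForN(EveryOtherByte, 2, /*Offset=*/0), "but not N = 2");
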
12329
12330// X86 has dedicated pack instructions that can handle specific truncation
12331// operations: PACKSS and PACKUS.
12332// Checks for compaction shuffle masks if MaxStages > 1.
12333// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12334static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12335 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12336 const SelectionDAG &DAG,
12337 const X86Subtarget &Subtarget,
12338 unsigned MaxStages = 1) {
12339 unsigned NumElts = VT.getVectorNumElements();
12340 unsigned BitSize = VT.getScalarSizeInBits();
12341 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12342 "Illegal maximum compaction");
12343
12344 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12345 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12346 unsigned NumPackedBits = NumSrcBits - BitSize;
12347 N1 = peekThroughBitcasts(N1);
12348 N2 = peekThroughBitcasts(N2);
12349 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12350 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12351 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12352 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12353 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12354 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12355 return false;
12356 if (Subtarget.hasSSE41() || BitSize == 8) {
12357 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12358 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12359 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12360 V1 = N1;
12361 V2 = N2;
12362 SrcVT = PackVT;
12363 PackOpcode = X86ISD::PACKUS;
12364 return true;
12365 }
12366 }
12367 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12368 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12369 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12370 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12371 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12372 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12373 V1 = N1;
12374 V2 = N2;
12375 SrcVT = PackVT;
12376 PackOpcode = X86ISD::PACKSS;
12377 return true;
12378 }
12379 return false;
12380 };
12381
12382 // Attempt to match against wider and wider compaction patterns.
12383 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12384 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12385 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12386
12387 // Try binary shuffle.
12388 SmallVector<int, 32> BinaryMask;
12389 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12390 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
12391 if (MatchPACK(V1, V2, PackVT))
12392 return true;
12393
12394 // Try unary shuffle.
12395 SmallVector<int, 32> UnaryMask;
12396 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12397 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
12398 if (MatchPACK(V1, V1, PackVT))
12399 return true;
12400 }
12401
12402 return false;
12403}
12404
12405static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12406 SDValue V1, SDValue V2, SelectionDAG &DAG,
12407 const X86Subtarget &Subtarget) {
12408 MVT PackVT;
12409 unsigned PackOpcode;
12410 unsigned SizeBits = VT.getSizeInBits();
12411 unsigned EltBits = VT.getScalarSizeInBits();
12412 unsigned MaxStages = Log2_32(64 / EltBits);
12413 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12414 Subtarget, MaxStages))
12415 return SDValue();
12416
12417 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12418 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12419
12420 // Don't lower multi-stage packs on AVX512, truncation is better.
12421 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12422 return SDValue();
12423
12424 // Pack to the largest type possible:
12425 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12426 unsigned MaxPackBits = 16;
12427 if (CurrentEltBits > 16 &&
12428 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12429 MaxPackBits = 32;
12430
12431 // Repeatedly pack down to the target size.
12432 SDValue Res;
12433 for (unsigned i = 0; i != NumStages; ++i) {
12434 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12435 unsigned NumSrcElts = SizeBits / SrcEltBits;
12436 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12437 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12438 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12439 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12440 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12441 DAG.getBitcast(SrcVT, V2));
12442 V1 = V2 = Res;
12443 CurrentEltBits /= 2;
12444 }
12445 assert(Res && Res.getValueType() == VT &&
12446 "Failed to lower compaction shuffle");
12447 return Res;
12448}
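
The stage count in the loop above follows directly from each PACK halving the element width; a small compile-time sketch:

constexpr unsigned packStages(unsigned SrcEltBits, unsigned DstEltBits) {
  unsigned Stages = 0;
  for (unsigned Bits = SrcEltBits; Bits > DstEltBits; Bits /= 2)
    ++Stages;
  return Stages;
}
static_assert(packStages(32, 8) == 2,
              "vXi32 -> vXi8 takes two packs, e.g. PACKSSDW then PACKSSWB");
static_assert(packStages(16, 8) == 1, "vXi16 -> vXi8 takes a single pack");
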
12449
12450/// Try to emit a bitmask instruction for a shuffle.
12451///
12452/// This handles cases where we can model a blend exactly as a bitmask due to
12453/// one of the inputs being zeroable.
12454static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12455 SDValue V2, ArrayRef<int> Mask,
12456 const APInt &Zeroable,
12457 const X86Subtarget &Subtarget,
12458 SelectionDAG &DAG) {
12459 MVT MaskVT = VT;
12460 MVT EltVT = VT.getVectorElementType();
12461 SDValue Zero, AllOnes;
12462 // Use f64 if i64 isn't legal.
12463 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12464 EltVT = MVT::f64;
12465 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12466 }
12467
12468 MVT LogicVT = VT;
12469 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12470 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12471 APFloat AllOnesValue =
12472 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12473 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12474 LogicVT =
12475 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12476 } else {
12477 Zero = DAG.getConstant(0, DL, EltVT);
12478 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12479 }
12480
12481 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12482 SDValue V;
12483 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12484 if (Zeroable[i])
12485 continue;
12486 if (Mask[i] % Size != i)
12487 return SDValue(); // Not a blend.
12488 if (!V)
12489 V = Mask[i] < Size ? V1 : V2;
12490 else if (V != (Mask[i] < Size ? V1 : V2))
12491 return SDValue(); // Can only let one input through the mask.
12492
12493 VMaskOps[i] = AllOnes;
12494 }
12495 if (!V)
12496 return SDValue(); // No non-zeroable elements!
12497
12498 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12499 VMask = DAG.getBitcast(LogicVT, VMask);
12500 V = DAG.getBitcast(LogicVT, V);
12501 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12502 return DAG.getBitcast(VT, And);
12503}
12504
12505/// Try to emit a blend instruction for a shuffle using bit math.
12506///
12507/// This is used as a fallback approach when first class blend instructions are
12508/// unavailable. Currently it is only suitable for integer vectors, but could
12509/// be generalized for floating point vectors if desirable.
12510static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12511 SDValue V2, ArrayRef<int> Mask,
12512 SelectionDAG &DAG) {
12513 assert(VT.isInteger() && "Only supports integer vector types!");
12514 MVT EltVT = VT.getVectorElementType();
12515 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12516 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12517 SmallVector<SDValue, 16> MaskOps;
12518 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12519 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12520 return SDValue(); // Shuffled input!
12521 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12522 }
12523
12524 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12525 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12526 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12527 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12528}
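
Per element, the fallback above computes (V1 & M) | (~M & V2) with M either all-ones or all-zeros; a scalar compile-time sketch of one element:

#include <cstdint>

// AND / ANDNP / OR on a single 32-bit element.
constexpr uint32_t bitBlendElt(uint32_t A, uint32_t B, bool TakeA) {
  return (A & (TakeA ? ~0u : 0u)) | (~(TakeA ? ~0u : 0u) & B);
}
static_assert(bitBlendElt(0x11111111u, 0x22222222u, true) == 0x11111111u, "");
static_assert(bitBlendElt(0x11111111u, 0x22222222u, false) == 0x22222222u, "");
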
12529
12530static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12531 SDValue PreservedSrc,
12532 const X86Subtarget &Subtarget,
12533 SelectionDAG &DAG);
12534
12535static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12536 MutableArrayRef<int> Mask,
12537 const APInt &Zeroable, bool &ForceV1Zero,
12538 bool &ForceV2Zero, uint64_t &BlendMask) {
12539 bool V1IsZeroOrUndef =
12540 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12541 bool V2IsZeroOrUndef =
12542 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12543
12544 BlendMask = 0;
12545 ForceV1Zero = false, ForceV2Zero = false;
12546 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12547
12548 // Attempt to generate the binary blend mask. If an input is zero then
12549 // we can use any lane.
12550 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12551 int M = Mask[i];
12552 if (M == SM_SentinelUndef)
12553 continue;
12554 if (M == i ||
12555 (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12556 Mask[i] = i;
12557 continue;
12558 }
12559 if (M == (i + Size) ||
12560 (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12561 BlendMask |= 1ull << i;
12562 Mask[i] = i + Size;
12563 continue;
12564 }
12565 if (Zeroable[i]) {
12566 if (V1IsZeroOrUndef) {
12567 ForceV1Zero = true;
12568 Mask[i] = i;
12569 continue;
12570 }
12571 if (V2IsZeroOrUndef) {
12572 ForceV2Zero = true;
12573 BlendMask |= 1ull << i;
12574 Mask[i] = i + Size;
12575 continue;
12576 }
12577 }
12578 return false;
12579 }
12580 return true;
12581}
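
A minimal sketch of the immediate derivation above, for a 4-element mask with no zeroable elements (bit i of the result is set exactly when element i is taken from V2; -1 means the mask is not a blend):

constexpr long long blendImm4(const int (&Mask)[4]) {
  long long Imm = 0;
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] < 0 || Mask[i] == i)
      continue;
    if (Mask[i] != i + 4)
      return -1; // the element moves within an input, so it is not a blend
    Imm |= 1ll << i;
  }
  return Imm;
}
constexpr int Blend[4] = {0, 5, 2, 7};
constexpr int NotABlend[4] = {1, 5, 2, 7};
static_assert(blendImm4(Blend) == 0xA, "elements 1 and 3 come from V2");
static_assert(blendImm4(NotABlend) == -1, "element 0 is shuffled, no BLENDI");
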
12582
12583static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12584 int Scale) {
12585 uint64_t ScaledMask = 0;
12586 for (int i = 0; i != Size; ++i)
12587 if (BlendMask & (1ull << i))
12588 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12589 return ScaledMask;
12590}
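
When each element is later split into Scale narrower elements, every set bit of the blend mask must be replicated Scale times; a compile-time sketch of the helper above:

constexpr unsigned long long scaleBlend(unsigned long long BlendMask, int Size,
                                        int Scale) {
  unsigned long long Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}
static_assert(scaleBlend(0x5, 4, 2) == 0x33,
              "a v4i32 blend 0b0101 becomes the v8i16 blend 0b00110011");
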
12591
12592/// Try to emit a blend instruction for a shuffle.
12593///
12594/// This doesn't do any checks for the availability of instructions for blending
12595/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12596/// be matched in the backend with the type given. What it does check for is
12597/// that the shuffle mask is a blend, or convertible into a blend with zero.
12598static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12599 SDValue V2, ArrayRef<int> Original,
12600 const APInt &Zeroable,
12601 const X86Subtarget &Subtarget,
12602 SelectionDAG &DAG) {
12603 uint64_t BlendMask = 0;
12604 bool ForceV1Zero = false, ForceV2Zero = false;
12605 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12606 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12607 BlendMask))
12608 return SDValue();
12609
12610 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12611 if (ForceV1Zero)
12612 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12613 if (ForceV2Zero)
12614 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12615
12616 unsigned NumElts = VT.getVectorNumElements();
12617
12618 switch (VT.SimpleTy) {
12619 case MVT::v4i64:
12620 case MVT::v8i32:
12621 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12622 LLVM_FALLTHROUGH;
12623 case MVT::v4f64:
12624 case MVT::v8f32:
12625 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12626 LLVM_FALLTHROUGH;
12627 case MVT::v2f64:
12628 case MVT::v2i64:
12629 case MVT::v4f32:
12630 case MVT::v4i32:
12631 case MVT::v8i16:
12632 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12633 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12634 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12635 case MVT::v16i16: {
12636 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12637 SmallVector<int, 8> RepeatedMask;
12638 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12639 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12640 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12641 BlendMask = 0;
12642 for (int i = 0; i < 8; ++i)
12643 if (RepeatedMask[i] >= 8)
12644 BlendMask |= 1ull << i;
12645 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12646 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12647 }
12648 // Use PBLENDW for lower/upper lanes and then blend lanes.
12649 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12650 // merge to VSELECT where useful.
12651 uint64_t LoMask = BlendMask & 0xFF;
12652 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12653 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12654 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12655 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12656 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12657 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12658 return DAG.getVectorShuffle(
12659 MVT::v16i16, DL, Lo, Hi,
12660 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12661 }
12662 LLVM_FALLTHROUGH;
12663 }
12664 case MVT::v32i8:
12665 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12666 LLVM_FALLTHROUGH;
12667 case MVT::v16i8: {
12668 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12669
12670 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12671 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12672 Subtarget, DAG))
12673 return Masked;
12674
12675 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12676 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12677 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12678 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12679 }
12680
12681 // If we have VPTERNLOG, we can use that as a bit blend.
12682 if (Subtarget.hasVLX())
12683 if (SDValue BitBlend =
12684 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12685 return BitBlend;
12686
12687 // Scale the blend by the number of bytes per element.
12688 int Scale = VT.getScalarSizeInBits() / 8;
12689
12690 // This form of blend is always done on bytes. Compute the byte vector
12691 // type.
12692 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12693
12694 // x86 allows load folding with blendvb from the 2nd source operand. But
12695 // we are still using LLVM select here (see comment below), so that's V1.
12696 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12697 // allow that load-folding possibility.
12698 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12699 ShuffleVectorSDNode::commuteMask(Mask);
12700 std::swap(V1, V2);
12701 }
12702
12703 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12704 // mix of LLVM's code generator and the x86 backend. We tell the code
12705 // generator that boolean values in the elements of an x86 vector register
12706 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12707 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12708 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12709 // of the element (the remaining are ignored) and 0 in that high bit would
12710 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12711 // the LLVM model for boolean values in vector elements gets the relevant
12712 // bit set, it is set backwards and over constrained relative to x86's
12713 // actual model.
12714 SmallVector<SDValue, 32> VSELECTMask;
12715 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12716 for (int j = 0; j < Scale; ++j)
12717 VSELECTMask.push_back(
12718 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12719 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12720 MVT::i8));
12721
12722 V1 = DAG.getBitcast(BlendVT, V1);
12723 V2 = DAG.getBitcast(BlendVT, V2);
12724 return DAG.getBitcast(
12725 VT,
12726 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12727 V1, V2));
12728 }
12729 case MVT::v16f32:
12730 case MVT::v8f64:
12731 case MVT::v8i64:
12732 case MVT::v16i32:
12733 case MVT::v32i16:
12734 case MVT::v64i8: {
12735 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12736 bool OptForSize = DAG.shouldOptForSize();
12737 if (!OptForSize) {
12738 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12739 Subtarget, DAG))
12740 return Masked;
12741 }
12742
12743 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12744 // masked move.
12745 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12746 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12747 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12748 }
12749 default:
12750 llvm_unreachable("Not a supported integer vector type!")::llvm::llvm_unreachable_internal("Not a supported integer vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 12750)
;
12751 }
12752}
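A side note on the vXi8/vXi16 byte-blend fallback above: the VSELECT condition is built by repeating each per-element decision once per byte of the element. The following standalone sketch (not part of X86ISelLowering.cpp; plain std::vector masks, -1 meaning undef in the shuffle mask, INT_MIN standing in for an undef byte) shows that expansion, which mirrors the VSELECTMask loop above:

#include <climits>
#include <cstdio>
#include <vector>

static std::vector<int> buildByteSelectMask(const std::vector<int> &Mask,
                                            int EltSizeInBits) {
  int Size = (int)Mask.size();
  int Scale = EltSizeInBits / 8;             // bytes per element
  std::vector<int> VSELECTMask;
  for (int i = 0; i < Size; ++i)
    for (int j = 0; j < Scale; ++j)
      VSELECTMask.push_back(Mask[i] < 0 ? INT_MIN            // undef byte
                                        : (Mask[i] < Size ? -1 : 0));
  return VSELECTMask;
}

int main() {
  // v4i32 blend of {V1[0], V2[1], V1[2], V2[3]} -> 16 byte-sized conditions.
  std::vector<int> Mask = {0, 5, 2, 7};
  for (int C : buildByteSelectMask(Mask, 32))
    std::printf("%d ", C);
  std::printf("\n");   // prints -1 x4, 0 x4, -1 x4, 0 x4
  return 0;
}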
12753
12754/// Try to lower as a blend of elements from two inputs followed by
12755/// a single-input permutation.
12756///
12757/// This matches the pattern where we can blend elements from two inputs and
12758/// then reduce the shuffle to a single-input permutation.
12759static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12760 SDValue V1, SDValue V2,
12761 ArrayRef<int> Mask,
12762 SelectionDAG &DAG,
12763 bool ImmBlends = false) {
12764 // We build up the blend mask while checking whether a blend is a viable way
12765 // to reduce the shuffle.
12766 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12767 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12768
12769 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12770 if (Mask[i] < 0)
12771 continue;
12772
12773 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12774
12775 if (BlendMask[Mask[i] % Size] < 0)
12776 BlendMask[Mask[i] % Size] = Mask[i];
12777 else if (BlendMask[Mask[i] % Size] != Mask[i])
12778 return SDValue(); // Can't blend in the needed input!
12779
12780 PermuteMask[i] = Mask[i] % Size;
12781 }
12782
12783 // If only immediate blends, then bail if the blend mask can't be widened to
12784 // i16.
12785 unsigned EltSize = VT.getScalarSizeInBits();
12786 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12787 return SDValue();
12788
12789 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12790 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12791}
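To make the decomposition above concrete, here is a standalone sketch (hypothetical helper names, plain std::vector masks with -1 as undef; not part of X86ISelLowering.cpp) that splits a two-input mask into the blend mask and the single-input permute mask, using a v4i32-sized example:

#include <cstdio>
#include <vector>

// Returns false when some lane would need elements from both inputs.
static bool decompose(const std::vector<int> &Mask,
                      std::vector<int> &Blend, std::vector<int> &Permute) {
  int Size = (int)Mask.size();
  Blend.assign(Size, -1);
  Permute.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Lane = Mask[i] % Size;                // lane the blend must supply
    if (Blend[Lane] < 0)
      Blend[Lane] = Mask[i];
    else if (Blend[Lane] != Mask[i])
      return false;                           // lane needed from both inputs
    Permute[i] = Lane;                        // single-input permute fixes order
  }
  return true;
}

int main() {
  std::vector<int> Blend, Permute;
  if (decompose({6, 1, 4, 3}, Blend, Permute)) {
    for (int M : Blend)
      std::printf("%d ", M);                  // 4 1 6 3
    std::printf("| ");
    for (int M : Permute)
      std::printf("%d ", M);                  // 2 1 0 3
    std::printf("\n");
  }
  return 0;
}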
12792
12793/// Try to lower as an unpack of elements from two inputs followed by
12794/// a single-input permutation.
12795///
12796/// This matches the pattern where we can unpack elements from two inputs and
12797/// then reduce the shuffle to a single-input (wider) permutation.
12798static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12799 SDValue V1, SDValue V2,
12800 ArrayRef<int> Mask,
12801 SelectionDAG &DAG) {
12802 int NumElts = Mask.size();
12803 int NumLanes = VT.getSizeInBits() / 128;
12804 int NumLaneElts = NumElts / NumLanes;
12805 int NumHalfLaneElts = NumLaneElts / 2;
12806
12807 bool MatchLo = true, MatchHi = true;
12808 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12809
12810 // Determine UNPCKL/UNPCKH type and operand order.
12811 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12812 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12813 int M = Mask[Lane + Elt];
12814 if (M < 0)
12815 continue;
12816
12817 SDValue &Op = Ops[Elt & 1];
12818 if (M < NumElts && (Op.isUndef() || Op == V1))
12819 Op = V1;
12820 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12821 Op = V2;
12822 else
12823 return SDValue();
12824
12825 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12826 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12827 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12828 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12829 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12830 if (!MatchLo && !MatchHi)
12831 return SDValue();
12832 }
12833 }
12834 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12835
12836 // Now check that each pair of elts come from the same unpack pair
12837 // and set the permute mask based on each pair.
12838 // TODO - Investigate cases where we permute individual elements.
12839 SmallVector<int, 32> PermuteMask(NumElts, -1);
12840 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12841 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12842 int M0 = Mask[Lane + Elt + 0];
12843 int M1 = Mask[Lane + Elt + 1];
12844 if (0 <= M0 && 0 <= M1 &&
12845 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12846 return SDValue();
12847 if (0 <= M0)
12848 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12849 if (0 <= M1)
12850 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12851 }
12852 }
12853
12854 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12855 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12856 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12857}
12858
12859/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12860/// permuting the elements of the result in place.
12861static SDValue lowerShuffleAsByteRotateAndPermute(
12862 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12863 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12864 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12865 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12866 (VT.is512BitVector() && !Subtarget.hasBWI()))
12867 return SDValue();
12868
12869 // We don't currently support lane crossing permutes.
12870 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12871 return SDValue();
12872
12873 int Scale = VT.getScalarSizeInBits() / 8;
12874 int NumLanes = VT.getSizeInBits() / 128;
12875 int NumElts = VT.getVectorNumElements();
12876 int NumEltsPerLane = NumElts / NumLanes;
12877
12878 // Determine range of mask elts.
12879 bool Blend1 = true;
12880 bool Blend2 = true;
12881 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12882 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12883 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12884 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12885 int M = Mask[Lane + Elt];
12886 if (M < 0)
12887 continue;
12888 if (M < NumElts) {
12889 Blend1 &= (M == (Lane + Elt));
12890 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12891 M = M % NumEltsPerLane;
12892 Range1.first = std::min(Range1.first, M);
12893 Range1.second = std::max(Range1.second, M);
12894 } else {
12895 M -= NumElts;
12896 Blend2 &= (M == (Lane + Elt));
12897 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12898 M = M % NumEltsPerLane;
12899 Range2.first = std::min(Range2.first, M);
12900 Range2.second = std::max(Range2.second, M);
12901 }
12902 }
12903 }
12904
12905 // Bail if we don't need both elements.
12906 // TODO - it might be worth doing this for unary shuffles if the permute
12907 // can be widened.
12908 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12909 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12910 return SDValue();
12911
12912 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12913 return SDValue();
12914
12915 // Rotate the 2 ops so we can access both ranges, then permute the result.
12916 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12917 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12918 SDValue Rotate = DAG.getBitcast(
12919 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12920 DAG.getBitcast(ByteVT, Lo),
12921 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12922 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12923 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12924 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12925 int M = Mask[Lane + Elt];
12926 if (M < 0)
12927 continue;
12928 if (M < NumElts)
12929 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12930 else
12931 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12932 }
12933 }
12934 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12935 };
12936
12937 // Check if the ranges are small enough to rotate from either direction.
12938 if (Range2.second < Range1.first)
12939 return RotateAndPermute(V1, V2, Range1.first, 0);
12940 if (Range1.second < Range2.first)
12941 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12942 return SDValue();
12943}
12944
12945static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
12946 return isUndefOrEqual(Mask, 0);
12947}
12948
12949static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
12950 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
12951}
12952
12953/// Generic routine to decompose a shuffle and blend into independent
12954/// blends and permutes.
12955///
12956/// This matches the extremely common pattern for handling combined
12957/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12958/// operations. It will try to pick the best arrangement of shuffles and
12959/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12960static SDValue lowerShuffleAsDecomposedShuffleMerge(
12961 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12962 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12963 int NumElts = Mask.size();
12964 int NumLanes = VT.getSizeInBits() / 128;
12965 int NumEltsPerLane = NumElts / NumLanes;
12966
12967 // Shuffle the input elements into the desired positions in V1 and V2 and
12968 // unpack/blend them together.
12969 bool IsAlternating = true;
12970 SmallVector<int, 32> V1Mask(NumElts, -1);
12971 SmallVector<int, 32> V2Mask(NumElts, -1);
12972 SmallVector<int, 32> FinalMask(NumElts, -1);
12973 for (int i = 0; i < NumElts; ++i) {
12974 int M = Mask[i];
12975 if (M >= 0 && M < NumElts) {
12976 V1Mask[i] = M;
12977 FinalMask[i] = i;
12978 IsAlternating &= (i & 1) == 0;
12979 } else if (M >= NumElts) {
12980 V2Mask[i] = M - NumElts;
12981 FinalMask[i] = i + NumElts;
12982 IsAlternating &= (i & 1) == 1;
12983 }
12984 }
12985
12986 // If we effectively only demand the 0'th element of \p Input, and not only
12987 // as 0'th element, then broadcast said input,
12988 // and change \p InputMask to be a no-op (identity) mask.
12989 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
12990 &DAG](SDValue &Input,
12991 MutableArrayRef<int> InputMask) {
12992 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
12993 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
12994 !X86::mayFoldLoad(Input, Subtarget)))
12995 return;
12996 if (isNoopShuffleMask(InputMask))
12997 return;
12998 assert(isBroadcastShuffleMask(InputMask) &&
12999        "Expected to demand only the 0'th element.");
13000 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13001 for (auto I : enumerate(InputMask)) {
13002 int &InputMaskElt = I.value();
13003 if (InputMaskElt >= 0)
13004 InputMaskElt = I.index();
13005 }
13006 };
13007
13008 // Currently, we may need to produce one shuffle per input, and blend results.
13009 // It is possible that the shuffle for one of the inputs is already a no-op.
13010 // See if we can simplify non-no-op shuffles into broadcasts,
13011 // which we consider to be strictly better than an arbitrary shuffle.
13012 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13013 isNoopOrBroadcastShuffleMask(V2Mask)) {
13014 canonicalizeBroadcastableInput(V1, V1Mask);
13015 canonicalizeBroadcastableInput(V2, V2Mask);
13016 }
13017
13018 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13019 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13020 // the shuffle may be able to fold with a load or other benefit. However, when
13021 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13022 // pre-shuffle first is a better strategy.
13023 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13024 // Only prefer immediate blends to unpack/rotate.
13025 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13026 DAG, true))
13027 return BlendPerm;
13028 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
13029 DAG))
13030 return UnpackPerm;
13031 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13032 DL, VT, V1, V2, Mask, Subtarget, DAG))
13033 return RotatePerm;
13034 // Unpack/rotate failed - try again with variable blends.
13035 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13036 DAG))
13037 return BlendPerm;
13038 }
13039
13040 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13041 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13042 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13043 // than half the elements coming from each source.
13044 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13045 V1Mask.assign(NumElts, -1);
13046 V2Mask.assign(NumElts, -1);
13047 FinalMask.assign(NumElts, -1);
13048 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13049 for (int j = 0; j != NumEltsPerLane; ++j) {
13050 int M = Mask[i + j];
13051 if (M >= 0 && M < NumElts) {
13052 V1Mask[i + (j / 2)] = M;
13053 FinalMask[i + j] = i + (j / 2);
13054 } else if (M >= NumElts) {
13055 V2Mask[i + (j / 2)] = M - NumElts;
13056 FinalMask[i + j] = i + (j / 2) + NumElts;
13057 }
13058 }
13059 }
13060
13061 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13062 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13063 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13064}
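The per-input split at the top of this routine can be illustrated with a small standalone example (assumed v4i32-sized mask, -1 as undef; not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

static void dump(const std::vector<int> &M) {
  for (int V : M)
    std::printf("%d ", V);
  std::printf("| ");
}

int main() {
  std::vector<int> Mask = {4, 1, 6, 3};            // v4i32 shuffle of two inputs
  int NumElts = (int)Mask.size();
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1), Final(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {                   // element comes from V1
      V1Mask[i] = M;
      Final[i] = i;
    } else if (M >= NumElts) {                     // element comes from V2
      V2Mask[i] = M - NumElts;
      Final[i] = i + NumElts;
    }
  }
  dump(V1Mask);   // -1 1 -1 3
  dump(V2Mask);   // 0 -1 2 -1
  dump(Final);    // 4 1 6 3
  std::printf("\n");
  return 0;
}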
13065
13066/// Try to lower a vector shuffle as a bit rotation.
13067///
13068/// Look for a repeated rotation pattern in each sub group.
13069/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13070static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13071 int NumElts = Mask.size();
13072 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13073
13074 int RotateAmt = -1;
13075 for (int i = 0; i != NumElts; i += NumSubElts) {
13076 for (int j = 0; j != NumSubElts; ++j) {
13077 int M = Mask[i + j];
13078 if (M < 0)
13079 continue;
13080 if (!isInRange(M, i, i + NumSubElts))
13081 return -1;
13082 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13083 if (0 <= RotateAmt && Offset != RotateAmt)
13084 return -1;
13085 RotateAmt = Offset;
13086 }
13087 }
13088 return RotateAmt;
13089}
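A standalone sketch of the matcher above (hypothetical helper, plain std::vector mask, negative entries treated as undef; not part of X86ISelLowering.cpp): every NumSubElts-sized group must encode the same rotation amount.

#include <cstdio>
#include <vector>

static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int NumElts = (int)Mask.size();
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1;                                 // crosses the sub group
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1;                                 // inconsistent rotation
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  // v16i8 mask repeating {3,0,1,2} per 4 bytes: ROTL of each i32 by 1 byte.
  std::vector<int> Mask = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
  std::printf("%d\n", matchBitRotate(Mask, 4));    // prints 1 (i.e. 8 bits)
  return 0;
}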
13090
13091static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13092 const X86Subtarget &Subtarget,
13093 ArrayRef<int> Mask) {
13094 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13095 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13096
13097 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13098 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13099 int MaxSubElts = 64 / EltSizeInBits;
13100 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13101 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13102 if (RotateAmt < 0)
13103 continue;
13104
13105 int NumElts = Mask.size();
13106 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13107 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13108 return RotateAmt * EltSizeInBits;
13109 }
13110
13111 return -1;
13112}
13113
13114/// Lower shuffle using X86ISD::VROTLI rotations.
13115static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13116 ArrayRef<int> Mask,
13117 const X86Subtarget &Subtarget,
13118 SelectionDAG &DAG) {
13119 // Only XOP + AVX512 targets have bit rotation instructions.
13120 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13121 bool IsLegal =
13122 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13123 if (!IsLegal && Subtarget.hasSSE3())
13124 return SDValue();
13125
13126 MVT RotateVT;
13127 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13128 Subtarget, Mask);
13129 if (RotateAmt < 0)
13130 return SDValue();
13131
13132 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13133 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13134 // widen to vXi16 or more then the existing lowering should be better.
13135 if (!IsLegal) {
13136 if ((RotateAmt % 16) == 0)
13137 return SDValue();
13138 // TODO: Use getTargetVShiftByConstNode.
13139 unsigned ShlAmt = RotateAmt;
13140 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13141 V1 = DAG.getBitcast(RotateVT, V1);
13142 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13143 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13144 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13145 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13146 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13147 return DAG.getBitcast(VT, Rot);
13148 }
13149
13150 SDValue Rot =
13151 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13152 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13153 return DAG.getBitcast(VT, Rot);
13154}
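For the pre-SSSE3 fallback above, the OR(SHL, SRL) expansion is the ordinary scalar rotate identity. A minimal sketch on a single 32-bit element (RotAmt assumed to be strictly between 0 and 32; not part of X86ISelLowering.cpp):

#include <cstdint>
#include <cstdio>

static uint32_t rotl32(uint32_t X, unsigned RotAmt) {
  // Valid only for 0 < RotAmt < 32; the vector lowering guarantees this.
  return (X << RotAmt) | (X >> (32 - RotAmt));
}

int main() {
  std::printf("0x%08x\n", rotl32(0x11223344u, 8));   // prints 0x22334411
  return 0;
}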
13155
13156/// Try to match a vector shuffle as an element rotation.
13157///
13158/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13159static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13160 ArrayRef<int> Mask) {
13161 int NumElts = Mask.size();
13162
13163 // We need to detect various ways of spelling a rotation:
13164 // [11, 12, 13, 14, 15, 0, 1, 2]
13165 // [-1, 12, 13, 14, -1, -1, 1, -1]
13166 // [-1, -1, -1, -1, -1, -1, 1, 2]
13167 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13168 // [-1, 4, 5, 6, -1, -1, 9, -1]
13169 // [-1, 4, 5, 6, -1, -1, -1, -1]
13170 int Rotation = 0;
13171 SDValue Lo, Hi;
13172 for (int i = 0; i < NumElts; ++i) {
13173 int M = Mask[i];
13174 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13175        "Unexpected mask index.");
13176 if (M < 0)
13177 continue;
13178
13179 // Determine where a rotated vector would have started.
13180 int StartIdx = i - (M % NumElts);
13181 if (StartIdx == 0)
13182 // The identity rotation isn't interesting, stop.
13183 return -1;
13184
13185 // If we found the tail of a vector the rotation must be the missing
13186 // front. If we found the head of a vector, it must be how much of the
13187 // head.
13188 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13189
13190 if (Rotation == 0)
13191 Rotation = CandidateRotation;
13192 else if (Rotation != CandidateRotation)
13193 // The rotations don't match, so we can't match this mask.
13194 return -1;
13195
13196 // Compute which value this mask is pointing at.
13197 SDValue MaskV = M < NumElts ? V1 : V2;
13198
13199 // Compute which of the two target values this index should be assigned
13200 // to. This reflects whether the high elements are remaining or the low
13201 // elements are remaining.
13202 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13203
13204 // Either set up this value if we've not encountered it before, or check
13205 // that it remains consistent.
13206 if (!TargetV)
13207 TargetV = MaskV;
13208 else if (TargetV != MaskV)
13209 // This may be a rotation, but it pulls from the inputs in some
13210 // unsupported interleaving.
13211 return -1;
13212 }
13213
13214 // Check that we successfully analyzed the mask, and normalize the results.
13215 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13216 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13217 if (!Lo)
13218 Lo = Hi;
13219 else if (!Hi)
13220 Hi = Lo;
13221
13222 V1 = Lo;
13223 V2 = Hi;
13224
13225 return Rotation;
13226}
13227
13228/// Try to lower a vector shuffle as a byte rotation.
13229///
13230/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13231/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13232/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13233 /// try to generically lower a vector shuffle through such a pattern. It
13234/// does not check for the profitability of lowering either as PALIGNR or
13235/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13236/// This matches shuffle vectors that look like:
13237///
13238/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13239///
13240/// Essentially it concatenates V1 and V2, shifts right by some number of
13241/// elements, and takes the low elements as the result. Note that while this is
13242/// specified as a *right shift* because x86 is little-endian, it is a *left
13243/// rotate* of the vector lanes.
13244static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13245 ArrayRef<int> Mask) {
13246 // Don't accept any shuffles with zero elements.
13247 if (isAnyZero(Mask))
13248 return -1;
13249
13250 // PALIGNR works on 128-bit lanes.
13251 SmallVector<int, 16> RepeatedMask;
13252 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13253 return -1;
13254
13255 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13256 if (Rotation <= 0)
13257 return -1;
13258
13259 // PALIGNR rotates bytes, so we need to scale the
13260 // rotation based on how many bytes are in the vector lane.
13261 int NumElts = RepeatedMask.size();
13262 int Scale = 16 / NumElts;
13263 return Rotation * Scale;
13264}
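Applying the matcher above to the v8i16 example from the comment gives a rotation of 3 elements, which PALIGNR encodes in bytes. A standalone sketch of just that computation (single-input rotation check only, no Lo/Hi operand tracking; not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {11, 12, 13, 14, 15, 0, 1, 2};   // v8i16
  int NumElts = (int)Mask.size();
  int Rotation = -1;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int StartIdx = i - (Mask[i] % NumElts);   // where a rotated vector would start
    int Cand = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation >= 0 && Cand != Rotation) {
      Rotation = -1;                          // inconsistent rotation amounts
      break;
    }
    Rotation = Cand;
  }
  int Scale = 16 / NumElts;                   // bytes per element in a 128-bit lane
  std::printf("rotate %d elts -> PALIGNR imm %d\n", Rotation, Rotation * Scale);
  return 0;                                   // prints: rotate 3 elts -> PALIGNR imm 6
}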
13265
13266static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13267 SDValue V2, ArrayRef<int> Mask,
13268 const X86Subtarget &Subtarget,
13269 SelectionDAG &DAG) {
13270 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13271
13272 SDValue Lo = V1, Hi = V2;
13273 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13274 if (ByteRotation <= 0)
13275 return SDValue();
13276
13277 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13278 // PSLLDQ/PSRLDQ.
13279 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13280 Lo = DAG.getBitcast(ByteVT, Lo);
13281 Hi = DAG.getBitcast(ByteVT, Hi);
13282
13283 // SSSE3 targets can use the palignr instruction.
13284 if (Subtarget.hasSSSE3()) {
13285 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13286        "512-bit PALIGNR requires BWI instructions");
13287 return DAG.getBitcast(
13288 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13289 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13290 }
13291
13292 assert(VT.is128BitVector() &&
13293        "Rotate-based lowering only supports 128-bit lowering!");
13294 assert(Mask.size() <= 16 &&
13295        "Can shuffle at most 16 bytes in a 128-bit vector!");
13296 assert(ByteVT == MVT::v16i8 &&
13297        "SSE2 rotate lowering only needed for v16i8!");
13298
13299 // Default SSE2 implementation
13300 int LoByteShift = 16 - ByteRotation;
13301 int HiByteShift = ByteRotation;
13302
13303 SDValue LoShift =
13304 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13305 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13306 SDValue HiShift =
13307 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13308 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13309 return DAG.getBitcast(VT,
13310 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13311}
13312
13313/// Try to lower a vector shuffle as a dword/qword rotation.
13314///
13315/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
13316/// rotation of the concatenation of two vectors; This routine will
13317 /// try to generically lower a vector shuffle through such a pattern.
13318///
13319/// Essentially it concatenates V1 and V2, shifts right by some number of
13320/// elements, and takes the low elements as the result. Note that while this is
13321/// specified as a *right shift* because x86 is little-endian, it is a *left
13322/// rotate* of the vector lanes.
13323static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13324 SDValue V2, ArrayRef<int> Mask,
13325 const X86Subtarget &Subtarget,
13326 SelectionDAG &DAG) {
13327 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13328        "Only 32-bit and 64-bit elements are supported!");
13329
13330 // 128/256-bit vectors are only supported with VLX.
13331 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13332        && "VLX required for 128/256-bit vectors");
13333
13334 SDValue Lo = V1, Hi = V2;
13335 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13336 if (Rotation <= 0)
13337 return SDValue();
13338
13339 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13340 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13341}
13342
13343/// Try to lower a vector shuffle as a byte shift sequence.
13344static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13345 SDValue V2, ArrayRef<int> Mask,
13346 const APInt &Zeroable,
13347 const X86Subtarget &Subtarget,
13348 SelectionDAG &DAG) {
13349 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13350 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13351
13352 // We need a shuffle that has zeros at one/both ends and a sequential
13353 // shuffle from one source within.
13354 unsigned ZeroLo = Zeroable.countTrailingOnes();
13355 unsigned ZeroHi = Zeroable.countLeadingOnes();
13356 if (!ZeroLo && !ZeroHi)
13357 return SDValue();
13358
13359 unsigned NumElts = Mask.size();
13360 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13361 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13362 return SDValue();
13363
13364 unsigned Scale = VT.getScalarSizeInBits() / 8;
13365 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13366 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13367 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13368 return SDValue();
13369
13370 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13371 Res = DAG.getBitcast(MVT::v16i8, Res);
13372
13373 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13374 // inner sequential set of elements, possibly offset:
13375 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13376 // 01234567 --> 4567zzzz --> zzzzz456
13377 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13378 if (ZeroLo == 0) {
13379 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13380 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13381 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13382 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13383 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13384 } else if (ZeroHi == 0) {
13385 unsigned Shift = Mask[ZeroLo] % NumElts;
13386 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13387 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13388 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13389 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13390 } else if (!Subtarget.hasSSSE3()) {
13391 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13392 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13393 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13394 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13395 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13396 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13397 Shift += Mask[ZeroLo] % NumElts;
13398 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13399 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13400 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13401 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13402 } else
13403 return SDValue();
13404
13405 return DAG.getBitcast(VT, Res);
13406}
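The three-shift case in the comment above ("01234567 --> z0123456 --> 3456zzzz --> zz3456zz") can be simulated on a plain character string; this standalone sketch (not part of X86ISelLowering.cpp) uses 'z' for a byte zeroed by the shift:

#include <cstdio>
#include <string>

static std::string vshldq(std::string V, int Amt) {   // shift toward higher indices
  return std::string(Amt, 'z') + V.substr(0, V.size() - Amt);
}
static std::string vsrldq(std::string V, int Amt) {   // shift toward lower indices
  return V.substr(Amt) + std::string(Amt, 'z');
}

int main() {
  std::string V = "01234567";   // want zz3456zz (ZeroLo = 2, ZeroHi = 2)
  V = vshldq(V, 1);             // z0123456
  V = vsrldq(V, 4);             // 3456zzzz
  V = vshldq(V, 2);             // zz3456zz
  std::printf("%s\n", V.c_str());
  return 0;
}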
13407
13408/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13409///
13410/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13411/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13412/// matches elements from one of the input vectors shuffled to the left or
13413/// right with zeroable elements 'shifted in'. It handles both the strictly
13414/// bit-wise element shifts and the byte shift across an entire 128-bit double
13415/// quad word lane.
13416///
13417/// PSHL : (little-endian) left bit shift.
13418/// [ zz, 0, zz, 2 ]
13419/// [ -1, 4, zz, -1 ]
13420/// PSRL : (little-endian) right bit shift.
13421/// [ 1, zz, 3, zz]
13422/// [ -1, -1, 7, zz]
13423/// PSLLDQ : (little-endian) left byte shift
13424/// [ zz, 0, 1, 2, 3, 4, 5, 6]
13425/// [ zz, zz, -1, -1, 2, 3, 4, -1]
13426/// [ zz, zz, zz, zz, zz, zz, -1, 1]
13427/// PSRLDQ : (little-endian) right byte shift
13428/// [ 5, 6, 7, zz, zz, zz, zz, zz]
13429/// [ -1, 5, 6, 7, zz, zz, zz, zz]
13430/// [ 1, 2, -1, -1, -1, -1, zz, zz]
13431static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13432 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13433 int MaskOffset, const APInt &Zeroable,
13434 const X86Subtarget &Subtarget) {
13435 int Size = Mask.size();
13436 unsigned SizeInBits = Size * ScalarSizeInBits;
13437
13438 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13439 for (int i = 0; i < Size; i += Scale)
13440 for (int j = 0; j < Shift; ++j)
13441 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13442 return false;
13443
13444 return true;
13445 };
13446
13447 auto MatchShift = [&](int Shift, int Scale, bool Left) {
13448 for (int i = 0; i != Size; i += Scale) {
13449 unsigned Pos = Left ? i + Shift : i;
13450 unsigned Low = Left ? i : i + Shift;
13451 unsigned Len = Scale - Shift;
13452 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13453 return -1;
13454 }
13455
13456 int ShiftEltBits = ScalarSizeInBits * Scale;
13457 bool ByteShift = ShiftEltBits > 64;
13458 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13459 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13460 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
13461
13462 // Normalize the scale for byte shifts to still produce an i64 element
13463 // type.
13464 Scale = ByteShift ? Scale / 2 : Scale;
13465
13466 // We need to round trip through the appropriate type for the shift.
13467 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
13468 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
13469 : MVT::getVectorVT(ShiftSVT, Size / Scale);
13470 return (int)ShiftAmt;
13471 };
13472
13473 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
13474 // keep doubling the size of the integer elements up to that. We can
13475 // then shift the elements of the integer vector by whole multiples of
13476 // their width within the elements of the larger integer vector. Test each
13477 // multiple to see if we can find a match with the moved element indices
13478 // and that the shifted in elements are all zeroable.
13479 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
13480 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
13481 for (int Shift = 1; Shift != Scale; ++Shift)
13482 for (bool Left : {true, false})
13483 if (CheckZeros(Shift, Scale, Left)) {
13484 int ShiftAmt = MatchShift(Shift, Scale, Left);
13485 if (0 < ShiftAmt)
13486 return ShiftAmt;
13487 }
13488
13489 // no match
13490 return -1;
13491}
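Restricting the matcher above to whole-lane byte shifts, the PSRLDQ example from the comment ([5, 6, 7, zz, zz, zz, zz, zz] for v8i16) resolves to a 10-byte right shift. A standalone sketch of that check (hypothetical encoding: -1 mask entries plus a separate zeroable vector; not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

int main() {
  // v8i16: [5, 6, 7, zz, zz, zz, zz, zz]
  std::vector<int> Mask = {5, 6, 7, -1, -1, -1, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, true, true, true, true, true};
  int NumElts = 8, EltBytes = 2;

  for (int Shift = 1; Shift < NumElts; ++Shift) {
    // Right shift: kept elements are sequential starting at Shift, zeros above.
    bool Ok = true;
    for (int i = 0; i < NumElts - Shift; ++i)
      Ok &= (Mask[i] < 0 || Mask[i] == i + Shift);
    for (int i = NumElts - Shift; i < NumElts; ++i)
      Ok &= Zeroable[i];
    if (Ok) {
      std::printf("PSRLDQ by %d bytes\n", Shift * EltBytes);   // prints 10
      break;
    }
  }
  return 0;
}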
13492
13493static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
13494 SDValue V2, ArrayRef<int> Mask,
13495 const APInt &Zeroable,
13496 const X86Subtarget &Subtarget,
13497 SelectionDAG &DAG) {
13498 int Size = Mask.size();
13499 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13500
13501 MVT ShiftVT;
13502 SDValue V = V1;
13503 unsigned Opcode;
13504
13505 // Try to match shuffle against V1 shift.
13506 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13507 Mask, 0, Zeroable, Subtarget);
13508
13509 // If V1 failed, try to match shuffle against V2 shift.
13510 if (ShiftAmt < 0) {
13511 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13512 Mask, Size, Zeroable, Subtarget);
13513 V = V2;
13514 }
13515
13516 if (ShiftAmt < 0)
13517 return SDValue();
13518
13519 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
13520        "Illegal integer vector type");
13521 V = DAG.getBitcast(ShiftVT, V);
13522 V = DAG.getNode(Opcode, DL, ShiftVT, V,
13523 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
13524 return DAG.getBitcast(VT, V);
13525}
13526
13527// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
13528// Remainder of lower half result is zero and upper half is all undef.
13529static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
13530 ArrayRef<int> Mask, uint64_t &BitLen,
13531 uint64_t &BitIdx, const APInt &Zeroable) {
13532 int Size = Mask.size();
13533 int HalfSize = Size / 2;
13534 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13535 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
13536
13537 // Upper half must be undefined.
13538 if (!isUndefUpperHalf(Mask))
13539 return false;
13540
13541 // Determine the extraction length from the part of the
13542 // lower half that isn't zeroable.
13543 int Len = HalfSize;
13544 for (; Len > 0; --Len)
13545 if (!Zeroable[Len - 1])
13546 break;
13547 assert(Len > 0 && "Zeroable shuffle mask");
13548
13549 // Attempt to match first Len sequential elements from the lower half.
13550 SDValue Src;
13551 int Idx = -1;
13552 for (int i = 0; i != Len; ++i) {
13553 int M = Mask[i];
13554 if (M == SM_SentinelUndef)
13555 continue;
13556 SDValue &V = (M < Size ? V1 : V2);
13557 M = M % Size;
13558
13559 // The extracted elements must start at a valid index and all mask
13560 // elements must be in the lower half.
13561 if (i > M || M >= HalfSize)
13562 return false;
13563
13564 if (Idx < 0 || (Src == V && Idx == (M - i))) {
13565 Src = V;
13566 Idx = M - i;
13567 continue;
13568 }
13569 return false;
13570 }
13571
13572 if (!Src || Idx < 0)
13573 return false;
13574
13575 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13576 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13577 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13578 V1 = Src;
13579 return true;
13580}
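For the EXTRQ match above, the immediate operands are simply the element run length and start index scaled to bits and truncated to 6 bits. A standalone sketch for a v8i16 lower half of [2, 3, zz, zz] with the upper half assumed undef (not part of X86ISelLowering.cpp):

#include <cstdio>
#include <vector>

int main() {
  int HalfSize = 4, EltBits = 16;
  std::vector<int> Mask      = {2, 3, -1, -1};             // lower half of the mask
  std::vector<bool> Zeroable = {false, false, true, true};

  int Len = HalfSize;
  while (Len > 0 && Zeroable[Len - 1])                     // trim zeroable tail
    --Len;

  int Idx = -1;
  bool Ok = true;
  for (int i = 0; i != Len; ++i) {
    if (Mask[i] < 0)
      continue;
    if (Idx < 0)
      Idx = Mask[i] - i;                                   // extraction start index
    Ok &= (Mask[i] - i == Idx) && (Mask[i] < HalfSize);    // sequential, in lower half
  }

  if (Ok && Idx >= 0)
    std::printf("EXTRQI BitLen=%d BitIdx=%d\n",            // prints 32 and 32
                (Len * EltBits) & 0x3f, (Idx * EltBits) & 0x3f);
  return 0;
}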
13581
13582// INSERTQ: Extract lowest Len elements from lower half of second source and
13583// insert over first source, starting at Idx.
13584// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13585static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13586 ArrayRef<int> Mask, uint64_t &BitLen,
13587 uint64_t &BitIdx) {
13588 int Size = Mask.size();
13589 int HalfSize = Size / 2;
13590 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13591
13592 // Upper half must be undefined.
13593 if (!isUndefUpperHalf(Mask))
13594 return false;
13595
13596 for (int Idx = 0; Idx != HalfSize; ++Idx) {
13597 SDValue Base;
13598
13599 // Attempt to match first source from mask before insertion point.
13600 if (isUndefInRange(Mask, 0, Idx)) {
13601 /* EMPTY */
13602 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13603 Base = V1;
13604 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13605 Base = V2;
13606 } else {
13607 continue;
13608 }
13609
13610 // Extend the extraction length looking to match both the insertion of
13611 // the second source and the remaining elements of the first.
13612 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13613 SDValue Insert;
13614 int Len = Hi - Idx;
13615
13616 // Match insertion.
13617 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13618 Insert = V1;
13619 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13620 Insert = V2;
13621 } else {
13622 continue;
13623 }
13624
13625 // Match the remaining elements of the lower half.
13626 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13627 /* EMPTY */
13628 } else if ((!Base || (Base == V1)) &&
13629 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13630 Base = V1;
13631 } else if ((!Base || (Base == V2)) &&
13632 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13633 Size + Hi)) {
13634 Base = V2;
13635 } else {
13636 continue;
13637 }
13638
13639 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13640 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13641 V1 = Base;
13642 V2 = Insert;
13643 return true;
13644 }
13645 }
13646
13647 return false;
13648}
13649
13650/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13651static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13652 SDValue V2, ArrayRef<int> Mask,
13653 const APInt &Zeroable, SelectionDAG &DAG) {
13654 uint64_t BitLen, BitIdx;
13655 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13656 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13657 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13658 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13659
13660 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13661 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13662 V2 ? V2 : DAG.getUNDEF(VT),
13663 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13664 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13665
13666 return SDValue();
13667}
13668
13669/// Lower a vector shuffle as a zero or any extension.
13670///
13671/// Given a specific number of elements, element bit width, and extension
13672/// stride, produce either a zero or any extension based on the available
13673 /// features of the subtarget. The extended elements are consecutive and
13674 /// can start from an offsetted element index in the input; to avoid
13675 /// excess shuffling, the offset must either be in the bottom lane or at
13676 /// the start of a higher lane. All extended elements must be from
13677/// the same lane.
13678static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13679 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13680 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13681 assert(Scale > 1 && "Need a scale to extend.");
13682 int EltBits = VT.getScalarSizeInBits();
13683 int NumElements = VT.getVectorNumElements();
13684 int NumEltsPerLane = 128 / EltBits;
13685 int OffsetLane = Offset / NumEltsPerLane;
13686 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13687        "Only 8, 16, and 32 bit elements can be extended.");
13688 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13689 assert(0 <= Offset && "Extension offset must be positive.");
13690 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13691        "Extension offset must be in the first lane or start an upper lane.");
13692
13693 // Check that an index is in same lane as the base offset.
13694 auto SafeOffset = [&](int Idx) {
13695 return OffsetLane == (Idx / NumEltsPerLane);
13696 };
13697
13698 // Shift along an input so that the offset base moves to the first element.
13699 auto ShuffleOffset = [&](SDValue V) {
13700 if (!Offset)
13701 return V;
13702
13703 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13704 for (int i = 0; i * Scale < NumElements; ++i) {
13705 int SrcIdx = i + Offset;
13706 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13707 }
13708 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13709 };
13710
13711 // Found a valid a/zext mask! Try various lowering strategies based on the
13712 // input type and available ISA extensions.
13713 if (Subtarget.hasSSE41()) {
13714 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13715 // PUNPCK will catch this in a later shuffle match.
13716 if (Offset && Scale == 2 && VT.is128BitVector())
13717 return SDValue();
13718 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13719 NumElements / Scale);
13720 InputV = ShuffleOffset(InputV);
13721 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13722 DL, ExtVT, InputV, DAG);
13723 return DAG.getBitcast(VT, InputV);
13724 }
13725
13726 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13727
13728 // For any extends we can cheat for larger element sizes and use shuffle
13729 // instructions that can fold with a load and/or copy.
13730 if (AnyExt && EltBits == 32) {
13731 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13732 -1};
13733 return DAG.getBitcast(
13734 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13735 DAG.getBitcast(MVT::v4i32, InputV),
13736 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13737 }
13738 if (AnyExt && EltBits == 16 && Scale > 2) {
13739 int PSHUFDMask[4] = {Offset / 2, -1,
13740 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13741 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13742 DAG.getBitcast(MVT::v4i32, InputV),
13743 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13744 int PSHUFWMask[4] = {1, -1, -1, -1};
13745 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13746 return DAG.getBitcast(
13747 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13748 DAG.getBitcast(MVT::v8i16, InputV),
13749 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13750 }
13751
13752 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13753 // to 64-bits.
13754 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13755 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13756 assert(VT.is128BitVector() && "Unexpected vector width!");
13757
13758 int LoIdx = Offset * EltBits;
13759 SDValue Lo = DAG.getBitcast(
13760 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13761 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13762 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13763
13764 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13765 return DAG.getBitcast(VT, Lo);
13766
13767 int HiIdx = (Offset + 1) * EltBits;
13768 SDValue Hi = DAG.getBitcast(
13769 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13770 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13771 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13772 return DAG.getBitcast(VT,
13773 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13774 }
13775
13776 // If this would require more than 2 unpack instructions to expand, use
13777 // pshufb when available. We can only use more than 2 unpack instructions
13778 // when zero extending i8 elements which also makes it easier to use pshufb.
13779 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13780 assert(NumElements == 16 && "Unexpected byte vector width!");
13781 SDValue PSHUFBMask[16];
13782 for (int i = 0; i < 16; ++i) {
13783 int Idx = Offset + (i / Scale);
13784 if ((i % Scale == 0 && SafeOffset(Idx))) {
13785 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13786 continue;
13787 }
13788 PSHUFBMask[i] =
13789 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13790 }
13791 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13792 return DAG.getBitcast(
13793 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13794 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13795 }
13796
13797 // If we are extending from an offset, ensure we start on a boundary that
13798 // we can unpack from.
13799 int AlignToUnpack = Offset % (NumElements / Scale);
13800 if (AlignToUnpack) {
13801 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13802 for (int i = AlignToUnpack; i < NumElements; ++i)
13803 ShMask[i - AlignToUnpack] = i;
13804 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13805 Offset -= AlignToUnpack;
13806 }
13807
13808 // Otherwise emit a sequence of unpacks.
13809 do {
13810 unsigned UnpackLoHi = X86ISD::UNPCKL;
13811 if (Offset >= (NumElements / 2)) {
13812 UnpackLoHi = X86ISD::UNPCKH;
13813 Offset -= (NumElements / 2);
13814 }
13815
13816 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13817 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13818 : getZeroVector(InputVT, Subtarget, DAG, DL);
13819 InputV = DAG.getBitcast(InputVT, InputV);
13820 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13821 Scale /= 2;
13822 EltBits *= 2;
13823 NumElements /= 2;
13824 } while (Scale > 1);
13825 return DAG.getBitcast(VT, InputV);
13826}
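The final unpack loop above widens the elements by repeatedly interleaving the input with zeros (or undef for any-extends). For illustration only, here is a small standalone C++ sketch of one UNPCKL-with-zero step; it is not LLVM code, and plain arrays stand in for vector registers.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical v16i8 input, In[i] = i + 1.
  std::array<uint8_t, 16> In;
  for (int i = 0; i < 16; ++i)
    In[i] = static_cast<uint8_t>(i + 1);

  // One UNPCKLBW(In, Zero) step: byte 2*i of the result is In[i], byte
  // 2*i+1 is 0, i.e. the low eight bytes get zero-extended in place.
  std::array<uint8_t, 16> Out;
  for (int i = 0; i < 8; ++i) {
    Out[2 * i] = In[i];
    Out[2 * i + 1] = 0;
  }

  // Read back as eight little-endian 16-bit lanes: prints 1 2 3 4 5 6 7 8.
  for (int i = 0; i < 8; ++i)
    std::printf("%d ", Out[2 * i] | (Out[2 * i + 1] << 8));
  std::printf("\n");
  return 0;
}

Repeating the step doubles the element width each time, which is why the loop halves Scale and NumElements until Scale reaches 1.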
13827
13828/// Try to lower a vector shuffle as a zero extension on any microarch.
13829///
13830/// This routine will try to do everything in its power to cleverly lower
13831/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13832/// check for the profitability of this lowering, it tries to aggressively
13833/// match this pattern. It will use all of the micro-architectural details it
13834/// can to emit an efficient lowering. It handles both blends with all-zero
13835/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13836/// masking out later).
13837///
13838/// The reason we have dedicated lowering for zext-style shuffles is that they
13839/// are both incredibly common and often quite performance sensitive.
13840static SDValue lowerShuffleAsZeroOrAnyExtend(
13841 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13842 const APInt &Zeroable, const X86Subtarget &Subtarget,
13843 SelectionDAG &DAG) {
13844 int Bits = VT.getSizeInBits();
13845 int NumLanes = Bits / 128;
13846 int NumElements = VT.getVectorNumElements();
13847 int NumEltsPerLane = NumElements / NumLanes;
13848 assert(VT.getScalarSizeInBits() <= 32 &&
13849        "Exceeds 32-bit integer zero extension limit");
13850 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13851
13852 // Define a helper function to check a particular ext-scale and lower to it if
13853 // valid.
13854 auto Lower = [&](int Scale) -> SDValue {
13855 SDValue InputV;
13856 bool AnyExt = true;
13857 int Offset = 0;
13858 int Matches = 0;
13859 for (int i = 0; i < NumElements; ++i) {
13860 int M = Mask[i];
13861 if (M < 0)
13862 continue; // Valid anywhere but doesn't tell us anything.
13863 if (i % Scale != 0) {
13864 // Each of the extended elements needs to be zeroable.
13865 if (!Zeroable[i])
13866 return SDValue();
13867
13868 // We no longer are in the anyext case.
13869 AnyExt = false;
13870 continue;
13871 }
13872
13873 // The base elements need to be consecutive indices into the
13874 // same input vector.
13875 SDValue V = M < NumElements ? V1 : V2;
13876 M = M % NumElements;
13877 if (!InputV) {
13878 InputV = V;
13879 Offset = M - (i / Scale);
13880 } else if (InputV != V)
13881 return SDValue(); // Flip-flopping inputs.
13882
13883 // Offset must start in the lowest 128-bit lane or at the start of an
13884 // upper lane.
13885 // FIXME: Is it ever worth allowing a negative base offset?
13886 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13887 (Offset % NumEltsPerLane) == 0))
13888 return SDValue();
13889
13890 // If we are offsetting, all referenced entries must come from the same
13891 // lane.
13892 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13893 return SDValue();
13894
13895 if ((M % NumElements) != (Offset + (i / Scale)))
13896 return SDValue(); // Non-consecutive strided elements.
13897 Matches++;
13898 }
13899
13900 // If we fail to find an input, we have a zero-shuffle which should always
13901 // have already been handled.
13902 // FIXME: Maybe handle this here in case during blending we end up with one?
13903 if (!InputV)
13904 return SDValue();
13905
13906 // If we are offsetting, don't extend if we only match a single input, we
13907 // can always do better by using a basic PSHUF or PUNPCK.
13908 if (Offset != 0 && Matches < 2)
13909 return SDValue();
13910
13911 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13912 InputV, Mask, Subtarget, DAG);
13913 };
13914
13915 // The widest scale possible for extending is to a 64-bit integer.
13916 assert(Bits % 64 == 0 &&
13917        "The number of bits in a vector must be divisible by 64 on x86!");
13918 int NumExtElements = Bits / 64;
13919
13920 // Each iteration, try extending the elements half as much, but into twice as
13921 // many elements.
13922 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13923 assert(NumElements % NumExtElements == 0 &&
13924        "The input vector size must be divisible by the extended size.");
13925 if (SDValue V = Lower(NumElements / NumExtElements))
13926 return V;
13927 }
13928
13929 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13930 if (Bits != 128)
13931 return SDValue();
13932
13933 // Returns one of the source operands if the shuffle can be reduced to a
13934 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13935 auto CanZExtLowHalf = [&]() {
13936 for (int i = NumElements / 2; i != NumElements; ++i)
13937 if (!Zeroable[i])
13938 return SDValue();
13939 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13940 return V1;
13941 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13942 return V2;
13943 return SDValue();
13944 };
13945
13946 if (SDValue V = CanZExtLowHalf()) {
13947 V = DAG.getBitcast(MVT::v2i64, V);
13948 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13949 return DAG.getBitcast(VT, V);
13950 }
13951
13952 // No viable ext lowering found.
13953 return SDValue();
13954}
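As a rough standalone restatement of the mask shape the Lower lambda accepts (simplified: single input, zero offset, no lane bookkeeping; this is not the LLVM routine itself):

#include <cstdio>
#include <vector>

// Simplified zext-mask check: mask entries < 0 are undef, every Scale-th
// entry must form a consecutive run, everything in between must be zeroable.
static bool isZExtMask(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int Scale) {
  for (int i = 0, e = (int)Mask.size(); i < e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // undef is acceptable anywhere
    if (i % Scale != 0) {
      if (!Zeroable[i])
        return false;           // the extended bits must be known zero
      continue;
    }
    if (M != i / Scale)
      return false;             // base elements must be consecutive
  }
  return true;
}

int main() {
  // Hypothetical v8i16 case: zero-extending the low four elements to 32 bits
  // looks like <0, z, 1, z, 2, z, 3, z>, where z (here 8..11) indexes an
  // all-zero second operand, so those positions are zeroable.
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<bool> Zeroable = {false, true, false, true,
                                false, true, false, true};
  std::printf("%d\n", isZExtMask(Mask, Zeroable, /*Scale=*/2)); // prints 1
  return 0;
}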
13955
13956/// Try to get a scalar value for a specific element of a vector.
13957///
13958/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13959static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13960 SelectionDAG &DAG) {
13961 MVT VT = V.getSimpleValueType();
13962 MVT EltVT = VT.getVectorElementType();
13963 V = peekThroughBitcasts(V);
13964
13965 // If the bitcasts shift the element size, we can't extract an equivalent
13966 // element from it.
13967 MVT NewVT = V.getSimpleValueType();
13968 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13969 return SDValue();
13970
13971 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13972 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13973 // Ensure the scalar operand is the same size as the destination.
13974 // FIXME: Add support for scalar truncation where possible.
13975 SDValue S = V.getOperand(Idx);
13976 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13977 return DAG.getBitcast(EltVT, S);
13978 }
13979
13980 return SDValue();
13981}
13982
13983/// Helper to test for a load that can be folded with x86 shuffles.
13984///
13985/// This is particularly important because the set of instructions varies
13986/// significantly based on whether the operand is a load or not.
13987static bool isShuffleFoldableLoad(SDValue V) {
13988 V = peekThroughBitcasts(V);
13989 return ISD::isNON_EXTLoad(V.getNode());
13990}
13991
13992/// Try to lower insertion of a single element into a zero vector.
13993///
13994 /// This is a common pattern for which we have especially efficient patterns to lower
13995/// across all subtarget feature sets.
13996static SDValue lowerShuffleAsElementInsertion(
13997 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13998 const APInt &Zeroable, const X86Subtarget &Subtarget,
13999 SelectionDAG &DAG) {
14000 MVT ExtVT = VT;
14001 MVT EltVT = VT.getVectorElementType();
14002
14003 int V2Index =
14004 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14005 Mask.begin();
14006 bool IsV1Zeroable = true;
14007 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14008 if (i != V2Index && !Zeroable[i]) {
14009 IsV1Zeroable = false;
14010 break;
14011 }
14012
14013 // Check for a single input from a SCALAR_TO_VECTOR node.
14014 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14015 // all the smarts here sunk into that routine. However, the current
14016 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14017 // vector shuffle lowering is dead.
14018 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14019 DAG);
14020 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14021 // We need to zext the scalar if it is smaller than an i32.
14022 V2S = DAG.getBitcast(EltVT, V2S);
14023 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14024 // Using zext to expand a narrow element won't work for non-zero
14025 // insertions.
14026 if (!IsV1Zeroable)
14027 return SDValue();
14028
14029 // Zero-extend directly to i32.
14030 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14031 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14032 }
14033 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14034 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14035 EltVT == MVT::i16) {
14036 // Either not inserting from the low element of the input or the input
14037 // element size is too small to use VZEXT_MOVL to clear the high bits.
14038 return SDValue();
14039 }
14040
14041 if (!IsV1Zeroable) {
14042 // If V1 can't be treated as a zero vector we have fewer options to lower
14043 // this. We can't support integer vectors or non-zero targets cheaply, and
14044 // the V1 elements can't be permuted in any way.
14045 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14046 if (!VT.isFloatingPoint() || V2Index != 0)
14047 return SDValue();
14048 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
14049 V1Mask[V2Index] = -1;
14050 if (!isNoopShuffleMask(V1Mask))
14051 return SDValue();
14052 if (!VT.is128BitVector())
14053 return SDValue();
14054
14055 // Otherwise, use MOVSD, MOVSS or MOVSH.
14056 unsigned MovOpc = 0;
14057 if (EltVT == MVT::f16)
14058 MovOpc = X86ISD::MOVSH;
14059 else if (EltVT == MVT::f32)
14060 MovOpc = X86ISD::MOVSS;
14061 else if (EltVT == MVT::f64)
14062 MovOpc = X86ISD::MOVSD;
14063 else
14064 llvm_unreachable("Unsupported floating point element type to handle!");
14065 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14066 }
14067
14068 // This lowering only works for the low element with floating point vectors.
14069 if (VT.isFloatingPoint() && V2Index != 0)
14070 return SDValue();
14071
14072 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14073 if (ExtVT != VT)
14074 V2 = DAG.getBitcast(VT, V2);
14075
14076 if (V2Index != 0) {
14077 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14078 // the desired position. Otherwise it is more efficient to do a vector
14079 // shift left. We know that we can do a vector shift left because all
14080 // the inputs are zero.
14081 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14082 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14083 V2Shuffle[V2Index] = 0;
14084 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14085 } else {
14086 V2 = DAG.getBitcast(MVT::v16i8, V2);
14087 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14088 DAG.getTargetConstant(
14089 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14090 V2 = DAG.getBitcast(VT, V2);
14091 }
14092 }
14093 return V2;
14094}
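For the non-trivial V2Index case above, the element is first moved to lane 0 (VZEXT_MOVL) and then shifted into position; the byte count for the VSHLDQ fallback is V2Index * EltSize / 8. A tiny standalone check with hypothetical values:

#include <cstdio>

int main() {
  // Hypothetical case: inserting into lane 3 of a v8i16 (16-bit elements).
  int V2Index = 3;
  int EltSizeInBits = 16;
  // The zero-extended element sits in lane 0 and is shifted left by whole
  // bytes to reach lane V2Index.
  std::printf("%d bytes\n", V2Index * EltSizeInBits / 8); // prints "6 bytes"
  return 0;
}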
14095
14096/// Try to lower broadcast of a single - truncated - integer element,
14097/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14098///
14099/// This assumes we have AVX2.
14100static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14101 int BroadcastIdx,
14102 const X86Subtarget &Subtarget,
14103 SelectionDAG &DAG) {
14104 assert(Subtarget.hasAVX2() &&
14105        "We can only lower integer broadcasts with AVX2!");
14106
14107 MVT EltVT = VT.getVectorElementType();
14108 MVT V0VT = V0.getSimpleValueType();
14109
14110 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14111 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14112
14113 MVT V0EltVT = V0VT.getVectorElementType();
14114 if (!V0EltVT.isInteger())
14115 return SDValue();
14116
14117 const unsigned EltSize = EltVT.getSizeInBits();
14118 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14119
14120 // This is only a truncation if the original element type is larger.
14121 if (V0EltSize <= EltSize)
14122 return SDValue();
14123
14124 assert(((V0EltSize % EltSize) == 0) &&
14125        "Scalar type sizes must all be powers of 2 on x86!");
14126
14127 const unsigned V0Opc = V0.getOpcode();
14128 const unsigned Scale = V0EltSize / EltSize;
14129 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14130
14131 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14132 V0Opc != ISD::BUILD_VECTOR)
14133 return SDValue();
14134
14135 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14136
14137 // If we're extracting non-least-significant bits, shift so we can truncate.
14138 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14139 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14140 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14141 if (const int OffsetIdx = BroadcastIdx % Scale)
14142 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14143 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14144
14145 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14146 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14147}
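The index arithmetic above (Scale, V0BroadcastIdx, OffsetIdx) just locates which scalar operand holds the requested narrow element and how far to shift it down before truncating. A standalone sketch with hypothetical widths, not LLVM code:

#include <cstdio>

int main() {
  unsigned EltSize = 16;      // broadcast element width
  unsigned V0EltSize = 64;    // width of the build_vector scalars
  unsigned BroadcastIdx = 5;  // narrow element requested by the shuffle

  unsigned Scale = V0EltSize / EltSize;           // 4 narrow elts per scalar
  unsigned V0BroadcastIdx = BroadcastIdx / Scale; // scalar operand 1
  unsigned OffsetIdx = BroadcastIdx % Scale;      // narrow elt 1 within it

  // The scalar is shifted right by OffsetIdx * EltSize bits, then truncated.
  std::printf("operand %u, srl %u, trunc to i%u\n", V0BroadcastIdx,
              OffsetIdx * EltSize, EltSize); // operand 1, srl 16, trunc to i16
  return 0;
}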
14148
14149/// Test whether this can be lowered with a single SHUFPS instruction.
14150///
14151/// This is used to disable more specialized lowerings when the shufps lowering
14152/// will happen to be efficient.
14153static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14154 // This routine only handles 128-bit shufps.
14155 assert(Mask.size() == 4 && "Unsupported mask size!");
14156 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14157 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14158 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14159 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14160
14161 // To lower with a single SHUFPS we need to have the low half and high half
14162 // each requiring a single input.
14163 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14164 return false;
14165 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14166 return false;
14167
14168 return true;
14169}
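Restated outside of LLVM for illustration: SHUFPS builds the low two result elements from one source and the high two from the other, so a mask qualifies iff each half reads from a single input. A minimal standalone version of the same check:

#include <cstdio>

static bool singleSHUFPS(const int M[4]) {
  // Undef elements (-1) are unconstrained; defined elements in each half
  // must agree on which input (0..3 vs 4..7) they come from.
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  int A[4] = {0, 3, 6, 5}; // low pair from V1, high pair from V2 -> ok
  int B[4] = {0, 4, 2, 6}; // low pair mixes V1 and V2 -> not ok
  std::printf("%d %d\n", singleSHUFPS(A), singleSHUFPS(B)); // prints: 1 0
  return 0;
}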
14170
14171/// If we are extracting two 128-bit halves of a vector and shuffling the
14172/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14173/// multi-shuffle lowering.
14174static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14175 SDValue N1, ArrayRef<int> Mask,
14176 SelectionDAG &DAG) {
14177 MVT VT = N0.getSimpleValueType();
14178 assert((VT.is128BitVector() &&
14179         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14180        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14181
14182 // Check that both sources are extracts of the same source vector.
14183 if (!N0.hasOneUse() || !N1.hasOneUse() ||
14184 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14185 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14186 N0.getOperand(0) != N1.getOperand(0))
14187 return SDValue();
14188
14189 SDValue WideVec = N0.getOperand(0);
14190 MVT WideVT = WideVec.getSimpleValueType();
14191 if (!WideVT.is256BitVector())
14192 return SDValue();
14193
14194 // Match extracts of each half of the wide source vector. Commute the shuffle
14195 // if the extract of the low half is N1.
14196 unsigned NumElts = VT.getVectorNumElements();
14197 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14198 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14199 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14200 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14201 ShuffleVectorSDNode::commuteMask(NewMask);
14202 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14203 return SDValue();
14204
14205 // Final bailout: if the mask is simple, we are better off using an extract
14206 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14207 // because that avoids a constant load from memory.
14208 if (NumElts == 4 &&
14209 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
14210 return SDValue();
14211
14212 // Extend the shuffle mask with undef elements.
14213 NewMask.append(NumElts, -1);
14214
14215 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14216 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14217 NewMask);
14218 // This is free: ymm -> xmm.
14219 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14220 DAG.getIntPtrConstant(0, DL));
14221}
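The mask rewrite above amounts to two small steps: if the low-half extract arrives as the second operand, commute the narrow mask so indices 0..3 refer to the low half of the wide vector; the narrow mask is then directly valid over the wide vector and only needs undef padding. A standalone sketch with a hypothetical v4f32-of-v8f32 mask:

#include <cstdio>
#include <vector>

// Swap which operand each defined mask index refers to (the effect of
// ShuffleVectorSDNode::commuteMask on a two-input mask).
static void commuteMask(std::vector<int> &M) {
  int N = (int)M.size();
  for (int &E : M)
    if (E >= 0)
      E = (E < N) ? E + N : E - N;
}

int main() {
  std::vector<int> Mask = {0, 7, 2, 5}; // shuffle of two extracted halves
  // Suppose the low-half extract showed up as the second operand: commute.
  commuteMask(Mask);                    // -> {4, 3, 6, 1}
  Mask.resize(8, -1);                   // widen with undef lanes
  for (int E : Mask)
    std::printf("%d ", E);              // prints: 4 3 6 1 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}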
14222
14223/// Try to lower broadcast of a single element.
14224///
14225/// For convenience, this code also bundles all of the subtarget feature set
14226/// filtering. While a little annoying to re-dispatch on type here, there isn't
14227/// a convenient way to factor it out.
14228static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14229 SDValue V2, ArrayRef<int> Mask,
14230 const X86Subtarget &Subtarget,
14231 SelectionDAG &DAG) {
14232 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14233 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14234 (Subtarget.hasAVX2() && VT.isInteger())))
14235 return SDValue();
14236
14237 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14238 // we can only broadcast from a register with AVX2.
14239 unsigned NumEltBits = VT.getScalarSizeInBits();
14240 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14241 ? X86ISD::MOVDDUP
14242 : X86ISD::VBROADCAST;
14243 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14244
14245 // Check that the mask is a broadcast.
14246 int BroadcastIdx = getSplatIndex(Mask);
14247 if (BroadcastIdx < 0)
14248 return SDValue();
14249 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14250                                           "a sorted mask where the broadcast "
14251                                           "comes from V1.");
14252
14253 // Go up the chain of (vector) values to find a scalar load that we can
14254 // combine with the broadcast.
14255 // TODO: Combine this logic with findEltLoadSrc() used by
14256 // EltsFromConsecutiveLoads().
14257 int BitOffset = BroadcastIdx * NumEltBits;
14258 SDValue V = V1;
14259 for (;;) {
14260 switch (V.getOpcode()) {
14261 case ISD::BITCAST: {
14262 V = V.getOperand(0);
14263 continue;
14264 }
14265 case ISD::CONCAT_VECTORS: {
14266 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14267 int OpIdx = BitOffset / OpBitWidth;
14268 V = V.getOperand(OpIdx);
14269 BitOffset %= OpBitWidth;
14270 continue;
14271 }
14272 case ISD::EXTRACT_SUBVECTOR: {
14273 // The extraction index adds to the existing offset.
14274 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14275 unsigned Idx = V.getConstantOperandVal(1);
14276 unsigned BeginOffset = Idx * EltBitWidth;
14277 BitOffset += BeginOffset;
14278 V = V.getOperand(0);
14279 continue;
14280 }
14281 case ISD::INSERT_SUBVECTOR: {
14282 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14283 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14284 int Idx = (int)V.getConstantOperandVal(2);
14285 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14286 int BeginOffset = Idx * EltBitWidth;
14287 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14288 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14289 BitOffset -= BeginOffset;
14290 V = VInner;
14291 } else {
14292 V = VOuter;
14293 }
14294 continue;
14295 }
14296 }
14297 break;
14298 }
14299 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14300 BroadcastIdx = BitOffset / NumEltBits;
14301
14302 // Do we need to bitcast the source to retrieve the original broadcast index?
14303 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14304
14305 // Check if this is a broadcast of a scalar. We special case lowering
14306 // for scalars so that we can more effectively fold with loads.
14307 // If the original value has a larger element type than the shuffle, the
14308 // broadcast element is in essence truncated. Make that explicit to ease
14309 // folding.
14310 if (BitCastSrc && VT.isInteger())
14311 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14312 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14313 return TruncBroadcast;
14314
14315 // Also check the simpler case, where we can directly reuse the scalar.
14316 if (!BitCastSrc &&
14317 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14318 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14319 V = V.getOperand(BroadcastIdx);
14320
14321 // If we can't broadcast from a register, check that the input is a load.
14322 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14323 return SDValue();
14324 } else if (ISD::isNormalLoad(V.getNode()) &&
14325 cast<LoadSDNode>(V)->isSimple()) {
14326 // We do not check for one-use of the vector load because a broadcast load
14327 // is expected to be a win for code size, register pressure, and possibly
14328 // uops even if the original vector load is not eliminated.
14329
14330 // Reduce the vector load and shuffle to a broadcasted scalar load.
14331 LoadSDNode *Ld = cast<LoadSDNode>(V);
14332 SDValue BaseAddr = Ld->getOperand(1);
14333 MVT SVT = VT.getScalarType();
14334 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14335 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14336 SDValue NewAddr =
14337 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14338
14339 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14340 // than MOVDDUP.
14341 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14342 if (Opcode == X86ISD::VBROADCAST) {
14343 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14344 SDValue Ops[] = {Ld->getChain(), NewAddr};
14345 V = DAG.getMemIntrinsicNode(
14346 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14347 DAG.getMachineFunction().getMachineMemOperand(
14348 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14349 DAG.makeEquivalentMemoryOrdering(Ld, V);
14350 return DAG.getBitcast(VT, V);
14351 }
14352 assert(SVT == MVT::f64 && "Unexpected VT!");
14353 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14354 DAG.getMachineFunction().getMachineMemOperand(
14355 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14356 DAG.makeEquivalentMemoryOrdering(Ld, V);
14357 } else if (!BroadcastFromReg) {
14358 // We can't broadcast from a vector register.
14359 return SDValue();
14360 } else if (BitOffset != 0) {
14361 // We can only broadcast from the zero-element of a vector register,
14362 // but it can be advantageous to broadcast from the zero-element of a
14363 // subvector.
14364 if (!VT.is256BitVector() && !VT.is512BitVector())
14365 return SDValue();
14366
14367 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14368 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14369 return SDValue();
14370
14371 // Only broadcast the zero-element of a 128-bit subvector.
14372 if ((BitOffset % 128) != 0)
14373 return SDValue();
14374
14375 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14376        "Unexpected bit-offset");
14377 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14378        "Unexpected vector size");
14379 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14380 V = extract128BitVector(V, ExtractIdx, DAG, DL);
14381 }
14382
14383 // On AVX we can use VBROADCAST directly for scalar sources.
14384 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14385 V = DAG.getBitcast(MVT::f64, V);
14386 if (Subtarget.hasAVX()) {
14387 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14388 return DAG.getBitcast(VT, V);
14389 }
14390 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14391 }
14392
14393 // If this is a scalar, do the broadcast on this type and bitcast.
14394 if (!V.getValueType().isVector()) {
14395 assert(V.getScalarValueSizeInBits() == NumEltBits &&
14396        "Unexpected scalar size");
14397 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14398 VT.getVectorNumElements());
14399 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14400 }
14401
14402 // We only support broadcasting from 128-bit vectors to minimize the
14403 // number of patterns we need to deal with in isel. So extract down to
14404 // 128-bits, removing as many bitcasts as possible.
14405 if (V.getValueSizeInBits() > 128)
14406 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14407
14408 // Otherwise cast V to a vector with the same element type as VT, but
14409 // possibly narrower than VT. Then perform the broadcast.
14410 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14411 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14412 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14413}
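The entry check above only asks whether the mask is a broadcast, i.e. whether every defined element names the same source element. A minimal standalone version of that idea (not the LLVM getSplatIndex helper itself):

#include <cstdio>
#include <vector>

// Return the common source index of a splat mask, or -1 if the defined
// elements disagree.
static int splatIndex(const std::vector<int> &Mask) {
  int Idx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;        // undef matches anything
    if (Idx >= 0 && M != Idx)
      return -1;       // two different sources: not a broadcast
    Idx = M;
  }
  return Idx;
}

int main() {
  std::printf("%d\n", splatIndex({2, -1, 2, 2})); // prints 2
  std::printf("%d\n", splatIndex({0, 1, 0, 0}));  // prints -1
  return 0;
}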
14414
14415// Check for whether we can use INSERTPS to perform the shuffle. We only use
14416// INSERTPS when the V1 elements are already in the correct locations
14417// because otherwise we can just always use two SHUFPS instructions which
14418// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
14419// perform INSERTPS if a single V1 element is out of place and all V2
14420// elements are zeroable.
14421static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14422 unsigned &InsertPSMask,
14423 const APInt &Zeroable,
14424 ArrayRef<int> Mask, SelectionDAG &DAG) {
14425 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14426 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14427 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14428
14429 // Attempt to match INSERTPS with one element from VA or VB being
14430 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14431 // are updated.
14432 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
14433 ArrayRef<int> CandidateMask) {
14434 unsigned ZMask = 0;
14435 int VADstIndex = -1;
14436 int VBDstIndex = -1;
14437 bool VAUsedInPlace = false;
14438
14439 for (int i = 0; i < 4; ++i) {
14440 // Synthesize a zero mask from the zeroable elements (includes undefs).
14441 if (Zeroable[i]) {
14442 ZMask |= 1 << i;
14443 continue;
14444 }
14445
14446 // Flag if we use any VA inputs in place.
14447 if (i == CandidateMask[i]) {
14448 VAUsedInPlace = true;
14449 continue;
14450 }
14451
14452 // We can only insert a single non-zeroable element.
14453 if (VADstIndex >= 0 || VBDstIndex >= 0)
14454 return false;
14455
14456 if (CandidateMask[i] < 4) {
14457 // VA input out of place for insertion.
14458 VADstIndex = i;
14459 } else {
14460 // VB input for insertion.
14461 VBDstIndex = i;
14462 }
14463 }
14464
14465 // Don't bother if we have no (non-zeroable) element for insertion.
14466 if (VADstIndex < 0 && VBDstIndex < 0)
14467 return false;
14468
14469 // Determine element insertion src/dst indices. The src index is from the
14470 // start of the inserted vector, not the start of the concatenated vector.
14471 unsigned VBSrcIndex = 0;
14472 if (VADstIndex >= 0) {
14473 // If we have a VA input out of place, we use VA as the V2 element
14474 // insertion and don't use the original V2 at all.
14475 VBSrcIndex = CandidateMask[VADstIndex];
14476 VBDstIndex = VADstIndex;
14477 VB = VA;
14478 } else {
14479 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
14480 }
14481
14482 // If no V1 inputs are used in place, then the result is created only from
14483 // the zero mask and the V2 insertion - so remove V1 dependency.
14484 if (!VAUsedInPlace)
14485 VA = DAG.getUNDEF(MVT::v4f32);
14486
14487 // Update V1, V2 and InsertPSMask accordingly.
14488 V1 = VA;
14489 V2 = VB;
14490
14491 // Insert the V2 element into the desired position.
14492 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
14493 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
14494 return true;
14495 };
14496
14497 if (matchAsInsertPS(V1, V2, Mask))
14498 return true;
14499
14500 // Commute and try again.
14501 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
14502 ShuffleVectorSDNode::commuteMask(CommutedMask);
14503 if (matchAsInsertPS(V2, V1, CommutedMask))
14504 return true;
14505
14506 return false;
14507}
14508
14509static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
14510 ArrayRef<int> Mask, const APInt &Zeroable,
14511 SelectionDAG &DAG) {
14512 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14513 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14514
14515 // Attempt to match the insertps pattern.
14516 unsigned InsertPSMask = 0;
14517 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
14518 return SDValue();
14519
14520 // Insert the V2 element into the desired position.
14521 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
14522 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
14523}
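The 8-bit INSERTPS immediate assembled in matchShuffleAsInsertPS packs the source element into bits [7:6], the destination lane into bits [5:4], and a zero mask into bits [3:0]. A standalone sketch with hypothetical field values:

#include <cstdio>

int main() {
  unsigned VBSrcIndex = 2; // take element 2 of the inserted vector
  unsigned VBDstIndex = 1; // write it into destination lane 1
  unsigned ZMask = 0b1000; // additionally zero destination lane 3

  unsigned InsertPSMask = (VBSrcIndex << 6) | (VBDstIndex << 4) | ZMask;
  std::printf("0x%02X\n", InsertPSMask); // prints 0x98
  return 0;
}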
14524
14525/// Try to lower a shuffle as a permute of the inputs followed by an
14526/// UNPCK instruction.
14527///
14528 /// This specifically targets cases where we end up alternating between
14529/// the two inputs, and so can permute them into something that feeds a single
14530/// UNPCK instruction. Note that this routine only targets integer vectors
14531/// because for floating point vectors we have a generalized SHUFPS lowering
14532/// strategy that handles everything that doesn't *exactly* match an unpack,
14533/// making this clever lowering unnecessary.
14534static SDValue lowerShuffleAsPermuteAndUnpack(
14535 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14536 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14537 assert(!VT.isFloatingPoint() &&
14538        "This routine only supports integer vectors.");
14539 assert(VT.is128BitVector() &&
14540        "This routine only works on 128-bit vectors.");
14541 assert(!V2.isUndef() &&
14542        "This routine should only be used when blending two inputs.");
14543 assert(Mask.size() >= 2 && "Single element masks are invalid.");
14544
14545 int Size = Mask.size();
14546
14547 int NumLoInputs =
14548 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
14549 int NumHiInputs =
14550 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
14551
14552 bool UnpackLo = NumLoInputs >= NumHiInputs;
14553
14554 auto TryUnpack = [&](int ScalarSize, int Scale) {
14555 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
14556 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
14557
14558 for (int i = 0; i < Size; ++i) {
14559 if (Mask[i] < 0)
14560 continue;
14561
14562 // Each element of the unpack contains Scale elements from this mask.
14563 int UnpackIdx = i / Scale;
14564
14565 // We only handle the case where V1 feeds the first slots of the unpack.
14566 // We rely on canonicalization to ensure this is the case.
14567 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14568 return SDValue();
14569
14570 // Setup the mask for this input. The indexing is tricky as we have to
14571 // handle the unpack stride.
14572 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14573 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14574 Mask[i] % Size;
14575 }
14576
14577 // If we will have to shuffle both inputs to use the unpack, check whether
14578 // we can just unpack first and shuffle the result. If so, skip this unpack.
14579 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14580 !isNoopShuffleMask(V2Mask))
14581 return SDValue();
14582
14583 // Shuffle the inputs into place.
14584 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14585 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14586
14587 // Cast the inputs to the type we will use to unpack them.
14588 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14589 V1 = DAG.getBitcast(UnpackVT, V1);
14590 V2 = DAG.getBitcast(UnpackVT, V2);
14591
14592 // Unpack the inputs and cast the result back to the desired type.
14593 return DAG.getBitcast(
14594 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14595 UnpackVT, V1, V2));
14596 };
14597
14598 // We try each unpack from the largest to the smallest to try and find one
14599 // that fits this mask.
14600 int OrigScalarSize = VT.getScalarSizeInBits();
14601 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14602 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14603 return Unpack;
14604
14605 // If we're shuffling with a zero vector then we're better off not doing
14606 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14607 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14608 ISD::isBuildVectorAllZeros(V2.getNode()))
14609 return SDValue();
14610
14611 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14612 // initial unpack.
14613 if (NumLoInputs == 0 || NumHiInputs == 0) {
14614 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14615        "We have to have *some* inputs!");
14616 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14617
14618 // FIXME: We could consider the total complexity of the permute of each
14619 // possible unpacking. Or at the least we should consider how many
14620 // half-crossings are created.
14621 // FIXME: We could consider commuting the unpacks.
14622
14623 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14624 for (int i = 0; i < Size; ++i) {
14625 if (Mask[i] < 0)
14626 continue;
14627
14628 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14629
14630 PermMask[i] =
14631 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14632 }
14633 return DAG.getVectorShuffle(
14634 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14635 DL, VT, V1, V2),
14636 DAG.getUNDEF(VT), PermMask);
14637 }
14638
14639 return SDValue();
14640}
14641
14642/// Handle lowering of 2-lane 64-bit floating point shuffles.
14643///
14644/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14645/// support for floating point shuffles but not integer shuffles. These
14646/// instructions will incur a domain crossing penalty on some chips though so
14647/// it is better to avoid lowering through this for integer vectors where
14648/// possible.
14649static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14650 const APInt &Zeroable, SDValue V1, SDValue V2,
14651 const X86Subtarget &Subtarget,
14652 SelectionDAG &DAG) {
14653 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14654 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14655 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14656
14657 if (V2.isUndef()) {
14658 // Check for being able to broadcast a single element.
14659 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14660 Mask, Subtarget, DAG))
14661 return Broadcast;
14662
14663 // Straight shuffle of a single input vector. Simulate this by using the
14664 // single input as both of the "inputs" to this instruction.
14665 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14666
14667 if (Subtarget.hasAVX()) {
14668 // If we have AVX, we can use VPERMILPS which will allow folding a load
14669 // into the shuffle.
14670 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14671 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14672 }
14673
14674 return DAG.getNode(
14675 X86ISD::SHUFP, DL, MVT::v2f64,
14676 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14677 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14678 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14679 }
14680 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14681 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14682 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14683 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14684
14685 if (Subtarget.hasAVX2())
14686 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14687 return Extract;
14688
14689 // When loading a scalar and then shuffling it into a vector we can often do
14690 // the insertion cheaply.
14691 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14692 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14693 return Insertion;
14694 // Try inverting the insertion since for v2 masks it is easy to do and we
14695 // can't reliably sort the mask one way or the other.
14696 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14697 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14698 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14699 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14700 return Insertion;
14701
14702 // Try to use one of the special instruction patterns to handle two common
14703 // blend patterns if a zero-blend above didn't work.
14704 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14705 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14706 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14707 // We can either use a special instruction to load over the low double or
14708 // to move just the low double.
14709 return DAG.getNode(
14710 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14711 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14712
14713 if (Subtarget.hasSSE41())
14714 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14715 Zeroable, Subtarget, DAG))
14716 return Blend;
14717
14718 // Use dedicated unpack instructions for masks that match their pattern.
14719 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14720 return V;
14721
14722 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14723 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14724 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14725}
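
For illustration, here is a minimal standalone sketch of the SHUFPD immediate math used above at lines 14665 and 14722 (plain C++, not part of X86ISelLowering.cpp; the helper names are invented). Bit 0 of the immediate picks the lane taken from the first source for result element 0, and bit 1 picks the lane taken from the second source for result element 1.

// Standalone illustration of the SHUFPD immediate computed above.
#include <cassert>

// Single-input case (line 14665): both SHUFPD "sources" are V1, and each
// mask element is 0 or 1 (undef elements simply test false and become 0).
static unsigned shufpdImmUnary(int M0, int M1) {
  return (M0 == 1) | ((M1 == 1) << 1);
}

// Two-input case (line 14722): Mask[1] indexes into V2 (values 2 or 3), so
// it is rebased by 2 before testing which lane of V2 is wanted.
static unsigned shufpdImmBinary(int M0, int M1) {
  return (M0 == 1) | (((M1 - 2) == 1) << 1);
}

int main() {
  assert(shufpdImmUnary(1, 0) == 0x1);  // <V1[1], V1[0]>
  assert(shufpdImmUnary(0, 0) == 0x0);  // splat of V1[0]
  assert(shufpdImmBinary(0, 3) == 0x2); // <V1[0], V2[1]>
  assert(shufpdImmBinary(1, 2) == 0x1); // <V1[1], V2[0]>
  return 0;
}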
14726
14727/// Handle lowering of 2-lane 64-bit integer shuffles.
14728///
14729/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14730/// the integer unit to minimize domain crossing penalties. However, for blends
14731/// it falls back to the floating point shuffle operation with appropriate bit
14732/// casting.
14733static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14734 const APInt &Zeroable, SDValue V1, SDValue V2,
14735 const X86Subtarget &Subtarget,
14736 SelectionDAG &DAG) {
14737 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14738 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14739 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14740
14741 if (V2.isUndef()) {
14742 // Check for being able to broadcast a single element.
14743 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14744 Mask, Subtarget, DAG))
14745 return Broadcast;
14746
14747 // Straight shuffle of a single input vector. For everything from SSE2
14748 // onward this has a single fast instruction with no scary immediates.
14749 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14750 V1 = DAG.getBitcast(MVT::v4i32, V1);
14751 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14752 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14753 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14754 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14755 return DAG.getBitcast(
14756 MVT::v2i64,
14757 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14758 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14759 }
14760 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14761 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14762 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14763 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14764
14765 if (Subtarget.hasAVX2())
14766 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14767 return Extract;
14768
14769 // Try to use shift instructions.
14770 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14771 Zeroable, Subtarget, DAG))
14772 return Shift;
14773
14774 // When loading a scalar and then shuffling it into a vector we can often do
14775 // the insertion cheaply.
14776 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14777 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14778 return Insertion;
14779 // Try inverting the insertion since for v2 masks it is easy to do and we
14780 // can't reliably sort the mask one way or the other.
14781 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14782 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14783 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14784 return Insertion;
14785
14786 // We have different paths for blend lowering, but they all must use the
14787 // *exact* same predicate.
14788 bool IsBlendSupported = Subtarget.hasSSE41();
14789 if (IsBlendSupported)
14790 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14791 Zeroable, Subtarget, DAG))
14792 return Blend;
14793
14794 // Use dedicated unpack instructions for masks that match their pattern.
14795 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14796 return V;
14797
14798 // Try to use byte rotation instructions.
14799 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14800 if (Subtarget.hasSSSE3()) {
14801 if (Subtarget.hasVLX())
14802 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14803 Subtarget, DAG))
14804 return Rotate;
14805
14806 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14807 Subtarget, DAG))
14808 return Rotate;
14809 }
14810
14811 // If we have direct support for blends, we should lower by decomposing into
14812 // a permute. That will be faster than the domain cross.
14813 if (IsBlendSupported)
14814 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14815 Subtarget, DAG);
14816
14817 // We implement this with SHUFPD which is pretty lame because it will likely
14818 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14819 // However, all the alternatives are still more cycles and newer chips don't
14820 // have this problem. It would be really nice if x86 had better shuffles here.
14821 V1 = DAG.getBitcast(MVT::v2f64, V1);
14822 V2 = DAG.getBitcast(MVT::v2f64, V2);
14823 return DAG.getBitcast(MVT::v2i64,
14824 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14825}
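
The single-input v2i64 path above (lines 14750-14758) widens the 64-bit mask into a 32-bit mask so PSHUFD can do the work. A standalone sketch of that widening, using an invented helper name rather than the LLVM API:

// Standalone sketch of the v2i64 -> v4i32 mask widening.
#include <array>
#include <cassert>

// Each 64-bit lane index M becomes the pair of 32-bit lane indices 2*M and
// 2*M+1; undef (-1) lanes stay undef.
static std::array<int, 4> widenV2ToV4(int M0, int M1) {
  return {M0 < 0 ? -1 : 2 * M0, M0 < 0 ? -1 : 2 * M0 + 1,
          M1 < 0 ? -1 : 2 * M1, M1 < 0 ? -1 : 2 * M1 + 1};
}

int main() {
  assert((widenV2ToV4(1, 0) == std::array<int, 4>{2, 3, 0, 1}));
  assert((widenV2ToV4(0, -1) == std::array<int, 4>{0, 1, -1, -1}));
  return 0;
}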
14826
14827/// Lower a vector shuffle using the SHUFPS instruction.
14828///
14829/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14830/// It makes no assumptions about whether this is the *best* lowering, it simply
14831/// uses it.
14832static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14833 ArrayRef<int> Mask, SDValue V1,
14834 SDValue V2, SelectionDAG &DAG) {
14835 SDValue LowV = V1, HighV = V2;
14836 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14837 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14838
14839 if (NumV2Elements == 1) {
14840 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14841
14842 // Compute the index adjacent to V2Index and in the same half by toggling
14843 // the low bit.
14844 int V2AdjIndex = V2Index ^ 1;
14845
14846 if (Mask[V2AdjIndex] < 0) {
14847 // Handles all the cases where we have a single V2 element and an undef.
14848 // This will only ever happen in the high lanes because we commute the
14849 // vector otherwise.
14850 if (V2Index < 2)
14851 std::swap(LowV, HighV);
14852 NewMask[V2Index] -= 4;
14853 } else {
14854 // Handle the case where the V2 element ends up adjacent to a V1 element.
14855 // To make this work, blend them together as the first step.
14856 int V1Index = V2AdjIndex;
14857 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14858 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14859 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14860
14861 // Now proceed to reconstruct the final blend as we have the necessary
14862 // high or low half formed.
14863 if (V2Index < 2) {
14864 LowV = V2;
14865 HighV = V1;
14866 } else {
14867 HighV = V2;
14868 }
14869 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14870 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14871 }
14872 } else if (NumV2Elements == 2) {
14873 if (Mask[0] < 4 && Mask[1] < 4) {
14874 // Handle the easy case where we have V1 in the low lanes and V2 in the
14875 // high lanes.
14876 NewMask[2] -= 4;
14877 NewMask[3] -= 4;
14878 } else if (Mask[2] < 4 && Mask[3] < 4) {
14879 // We also handle the reversed case because this utility may get called
14880 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14881 // arrange things in the right direction.
14882 NewMask[0] -= 4;
14883 NewMask[1] -= 4;
14884 HighV = V1;
14885 LowV = V2;
14886 } else {
14887 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14888 // trying to place elements directly, just blend them and set up the final
14889 // shuffle to place them.
14890
14891 // The first two blend mask elements are for V1, the second two are for
14892 // V2.
14893 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14894 Mask[2] < 4 ? Mask[2] : Mask[3],
14895 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14896 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14897 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14898 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14899
14900 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14901 // a blend.
14902 LowV = HighV = V1;
14903 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14904 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14905 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14906 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14907 }
14908 } else if (NumV2Elements == 3) {
14909 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14910 // we can get here due to other paths (e.g. repeated mask matching) where we
14911 // don't want to do another round of lowerVECTOR_SHUFFLE.
14912 ShuffleVectorSDNode::commuteMask(NewMask);
14913 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14914 }
14915 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14916 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14917}
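
The 4-lane shuffle paths in this file feed getV4X86ShuffleImm8ForMask, which packs one 2-bit source index per result element into an 8-bit immediate, element 0 in the low bits. A hedged standalone sketch of that packing (packShuffleImm8 is an invented name; the real helper also has to pick values for undef lanes and wraps the result in a target constant):

// Standalone sketch of the 2-bits-per-lane immediate packing. Undef lanes
// simply default to 0 here; the real helper is free to pick any value.
#include <cassert>

static unsigned packShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef (-1) as lane 0
    Imm |= (unsigned)M << (2 * i);     // element i lives in bits [2*i+1 : 2*i]
  }
  return Imm;
}

int main() {
  const int Identity[4] = {0, 1, 2, 3};
  const int Reverse[4] = {3, 2, 1, 0};
  assert(packShuffleImm8(Identity) == 0xE4); // 11 10 01 00
  assert(packShuffleImm8(Reverse) == 0x1B);  // 00 01 10 11
  return 0;
}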
14918
14919/// Lower 4-lane 32-bit floating point shuffles.
14920///
14921/// Uses instructions exclusively from the floating point unit to minimize
14922/// domain crossing penalties, as these are sufficient to implement all v4f32
14923/// shuffles.
14924static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14925 const APInt &Zeroable, SDValue V1, SDValue V2,
14926 const X86Subtarget &Subtarget,
14927 SelectionDAG &DAG) {
14928 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14929 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14930 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14931
14932 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14933
14934 if (NumV2Elements == 0) {
14935 // Check for being able to broadcast a single element.
14936 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14937 Mask, Subtarget, DAG))
14938 return Broadcast;
14939
14940 // Use even/odd duplicate instructions for masks that match their pattern.
14941 if (Subtarget.hasSSE3()) {
14942 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14943 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14944 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14945 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14946 }
14947
14948 if (Subtarget.hasAVX()) {
14949 // If we have AVX, we can use VPERMILPS which will allow folding a load
14950 // into the shuffle.
14951 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14952 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14953 }
14954
14955 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14956 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14957 if (!Subtarget.hasSSE2()) {
14958 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14959 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14960 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14961 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14962 }
14963
14964 // Otherwise, use a straight shuffle of a single input vector. We pass the
14965 // input vector to both operands to simulate this with a SHUFPS.
14966 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14967 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14968 }
14969
14970 if (Subtarget.hasAVX2())
14971 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14972 return Extract;
14973
14974 // There are special ways we can lower some single-element blends. However, we
14975 // also have custom lowerings for more complex single-element blends below,
14976 // which we defer to if both this and BLENDPS fail to match. So restrict this
14977 // to when the V2 input is targeting element 0 of the mask -- that is the fast
14978 // case here.
14979 if (NumV2Elements == 1 && Mask[0] >= 4)
14980 if (SDValue V = lowerShuffleAsElementInsertion(
14981 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14982 return V;
14983
14984 if (Subtarget.hasSSE41()) {
14985 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14986 Zeroable, Subtarget, DAG))
14987 return Blend;
14988
14989 // Use INSERTPS if we can complete the shuffle efficiently.
14990 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14991 return V;
14992
14993 if (!isSingleSHUFPSMask(Mask))
14994 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14995 V2, Mask, DAG))
14996 return BlendPerm;
14997 }
14998
14999 // Use low/high mov instructions. These are only valid in SSE1 because
15000 // otherwise they are widened to v2f64 and never get here.
15001 if (!Subtarget.hasSSE2()) {
15002 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15003 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15004 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15005 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15006 }
15007
15008 // Use dedicated unpack instructions for masks that match their pattern.
15009 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15010 return V;
15011
15012 // Otherwise fall back to a SHUFPS lowering strategy.
15013 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15014}
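
The MOVSLDUP/MOVSHDUP checks above compare the incoming mask against the fixed patterns {0,0,2,2} and {1,1,3,3}, treating undef lanes as wildcards. A simplified stand-in for that comparison (masksEquivalent is an invented name; the real isShuffleEquivalent also folds in knowledge about the shuffle operands):

// Simplified take on the mask-equivalence test used above.
#include <cassert>
#include <vector>

// A mask matches a reference pattern if every defined element agrees; undef
// (-1) elements act as wildcards.
static bool masksEquivalent(const std::vector<int> &Mask,
                            const std::vector<int> &Ref) {
  if (Mask.size() != Ref.size())
    return false;
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Ref[i])
      return false;
  return true;
}

int main() {
  const std::vector<int> EvenDup = {0, 0, 2, 2}; // MOVSLDUP pattern
  const std::vector<int> OddDup = {1, 1, 3, 3};  // MOVSHDUP pattern
  const std::vector<int> A = {0, -1, 2, -1};     // still even-duplicate
  const std::vector<int> B = {1, 1, 3, 2};       // not odd-duplicate
  assert(masksEquivalent(A, EvenDup));
  assert(!masksEquivalent(B, OddDup));
  return 0;
}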
15015
15016/// Lower 4-lane i32 vector shuffles.
15017///
15018/// We try to handle these with integer-domain shuffles where we can, but for
15019/// blends we use the floating point domain blend instructions.
15020static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15021 const APInt &Zeroable, SDValue V1, SDValue V2,
15022 const X86Subtarget &Subtarget,
15023 SelectionDAG &DAG) {
15024 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15025 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15026 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15027
15028 // Whenever we can lower this as a zext, that instruction is strictly faster
15029 // than any alternative. It also allows us to fold memory operands into the
15030 // shuffle in many cases.
15031 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15032 Zeroable, Subtarget, DAG))
15033 return ZExt;
15034
15035 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15036
15037 if (NumV2Elements == 0) {
15038 // Try to use broadcast unless the mask only has one non-undef element.
15039 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15040 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15041 Mask, Subtarget, DAG))
15042 return Broadcast;
15043 }
15044
15045 // Straight shuffle of a single input vector. For everything from SSE2
15046 // onward this has a single fast instruction with no scary immediates.
15047 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15048 // but we aren't actually going to use the UNPCK instruction because doing
15049 // so prevents folding a load into this instruction or making a copy.
15050 const int UnpackLoMask[] = {0, 0, 1, 1};
15051 const int UnpackHiMask[] = {2, 2, 3, 3};
15052 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15053 Mask = UnpackLoMask;
15054 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15055 Mask = UnpackHiMask;
15056
15057 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15058 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15059 }
15060
15061 if (Subtarget.hasAVX2())
15062 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15063 return Extract;
15064
15065 // Try to use shift instructions.
15066 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
15067 Zeroable, Subtarget, DAG))
15068 return Shift;
15069
15070 // There are special ways we can lower some single-element blends.
15071 if (NumV2Elements == 1)
15072 if (SDValue V = lowerShuffleAsElementInsertion(
15073 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15074 return V;
15075
15076 // We have different paths for blend lowering, but they all must use the
15077 // *exact* same predicate.
15078 bool IsBlendSupported = Subtarget.hasSSE41();
15079 if (IsBlendSupported)
15080 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15081 Zeroable, Subtarget, DAG))
15082 return Blend;
15083
15084 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15085 Zeroable, Subtarget, DAG))
15086 return Masked;
15087
15088 // Use dedicated unpack instructions for masks that match their pattern.
15089 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15090 return V;
15091
15092 // Try to use byte rotation instructions.
15093 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15094 if (Subtarget.hasSSSE3()) {
15095 if (Subtarget.hasVLX())
15096 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15097 Subtarget, DAG))
15098 return Rotate;
15099
15100 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15101 Subtarget, DAG))
15102 return Rotate;
15103 }
15104
15105 // Assume that a single SHUFPS is faster than an alternative sequence of
15106 // multiple instructions (even if the CPU has a domain penalty).
15107 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15108 if (!isSingleSHUFPSMask(Mask)) {
15109 // If we have direct support for blends, we should lower by decomposing into
15110 // a permute. That will be faster than the domain cross.
15111 if (IsBlendSupported)
15112 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15113 Subtarget, DAG);
15114
15115 // Try to lower by permuting the inputs into an unpack instruction.
15116 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15117 Mask, Subtarget, DAG))
15118 return Unpack;
15119 }
15120
15121 // We implement this with SHUFPS because it can blend from two vectors.
15122 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15123 // up the inputs, bypassing domain shift penalties that we would incur if we
15124 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15125 // relevant.
15126 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15127 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15128 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15129 return DAG.getBitcast(MVT::v4i32, ShufPS);
15130}
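
In the unary v4i32 path above (lines 15045-15058), a mask equivalent to {0,0,1,1} or {2,2,3,3} is canonicalized to exactly that pattern and then emitted as a PSHUFD rather than an UNPCK so a load can still be folded. The resulting immediates, worked out with the same two-bits-per-lane packing shown earlier:

// Worked immediates for the two coerced unary masks above, with lane 0 in
// the low two bits of the immediate.
#include <cassert>

int main() {
  unsigned ImmLo = 0 | (0 << 2) | (1 << 4) | (1 << 6); // mask {0, 0, 1, 1}
  unsigned ImmHi = 2 | (2 << 2) | (3 << 4) | (3 << 6); // mask {2, 2, 3, 3}
  assert(ImmLo == 0x50); // PSHUFD $0x50
  assert(ImmHi == 0xFA); // PSHUFD $0xFA
  return 0;
}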
15131
15132/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15133/// shuffle lowering, and the most complex part.
15134///
15135/// The lowering strategy is to try to form pairs of input lanes which are
15136/// targeted at the same half of the final vector, and then use a dword shuffle
15137/// to place them onto the right half, and finally unpack the paired lanes into
15138/// their final position.
15139///
15140/// The exact breakdown of how to form these dword pairs and align them on the
15141/// correct sides is really tricky. See the comments within the function for
15142/// more of the details.
15143///
15144/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15145/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15146/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15147/// vector, form the analogous 128-bit 8-element Mask.
15148static SDValue lowerV8I16GeneralSingleInputShuffle(
15149 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15150 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15151 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15152 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15153
15154 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15155 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15156 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15157
15158 // Attempt to directly match PSHUFLW or PSHUFHW.
15159 if (isUndefOrInRange(LoMask, 0, 4) &&
15160 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15161 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15162 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15163 }
15164 if (isUndefOrInRange(HiMask, 4, 8) &&
15165 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15166 for (int i = 0; i != 4; ++i)
15167 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15168 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15169 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15170 }
15171
15172 SmallVector<int, 4> LoInputs;
15173 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15174 array_pod_sort(LoInputs.begin(), LoInputs.end());
15175 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15176 SmallVector<int, 4> HiInputs;
15177 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15178 array_pod_sort(HiInputs.begin(), HiInputs.end());
15179 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15180 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15181 int NumHToL = LoInputs.size() - NumLToL;
15182 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15183 int NumHToH = HiInputs.size() - NumLToH;
15184 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15185 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15186 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15187 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15188
15189 // If we are shuffling values from one half - check how many different DWORD
15190 // pairs we need to create. If only 1 or 2 then we can perform this as a
15191 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15192 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15193 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15194 V = DAG.getNode(ShufWOp, DL, VT, V,
15195 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15196 V = DAG.getBitcast(PSHUFDVT, V);
15197 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15198 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15199 return DAG.getBitcast(VT, V);
15200 };
15201
15202 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15203 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15204 SmallVector<std::pair<int, int>, 4> DWordPairs;
15205 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15206
15207 // Collect the different DWORD pairs.
15208 for (int DWord = 0; DWord != 4; ++DWord) {
15209 int M0 = Mask[2 * DWord + 0];
15210 int M1 = Mask[2 * DWord + 1];
15211 M0 = (M0 >= 0 ? M0 % 4 : M0);
15212 M1 = (M1 >= 0 ? M1 % 4 : M1);
15213 if (M0 < 0 && M1 < 0)
15214 continue;
15215
15216 bool Match = false;
15217 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15218 auto &DWordPair = DWordPairs[j];
15219 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15220 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15221 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15222 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15223 PSHUFDMask[DWord] = DOffset + j;
15224 Match = true;
15225 break;
15226 }
15227 }
15228 if (!Match) {
15229 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15230 DWordPairs.push_back(std::make_pair(M0, M1));
15231 }
15232 }
15233
15234 if (DWordPairs.size() <= 2) {
15235 DWordPairs.resize(2, std::make_pair(-1, -1));
15236 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15237 DWordPairs[1].first, DWordPairs[1].second};
15238 if ((NumHToL + NumHToH) == 0)
15239 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15240 if ((NumLToL + NumLToH) == 0)
15241 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15242 }
15243 }
15244
15245 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15246 // such inputs we can swap two of the dwords across the half mark and end up
15247 // with <=2 inputs to each half in each half. Once there, we can fall through
15248 // to the generic code below. For example:
15249 //
15250 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15251 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15252 //
15253 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15254 // and an existing 2-into-2 on the other half. In this case we may have to
15255 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15256 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15257 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15258 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15259 // half than the one we target for fixing) will be fixed when we re-enter this
15260 // path. We will also combine any resulting sequence of PSHUFD instructions
15261 // into a single instruction. Here is an example of the tricky case:
15262 //
15263 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15264 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15265 //
15266 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15267 //
15268 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15269 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15270 //
15271 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15272 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15273 //
15274 // The result is fine to be handled by the generic logic.
15275 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15276 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15277 int AOffset, int BOffset) {
15278 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15279 "Must call this with A having 3 or 1 inputs from the A half.");
15280 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15281 "Must call this with B having 1 or 3 inputs from the B half.");
15282 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15283 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15284
15285 bool ThreeAInputs = AToAInputs.size() == 3;
15286
15287 // Compute the index of the dword with only one word among the three inputs in
15288 // a half by taking the sum of the half with three inputs and subtracting
15289 // the sum of the actual three inputs. The difference is the remaining
15290 // slot.
15291 int ADWord = 0, BDWord = 0;
15292 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15293 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15294 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15295 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15296 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15297 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15298 int TripleNonInputIdx =
15299 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15300 TripleDWord = TripleNonInputIdx / 2;
15301
15302 // We use xor with one to compute the adjacent DWord to whichever one the
15303 // OneInput is in.
15304 OneInputDWord = (OneInput / 2) ^ 1;
15305
15306 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15307 // and BToA inputs. If there is also such a problem with the BToB and AToB
15308 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15309 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15310 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15311 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15312 // Compute how many inputs will be flipped by swapping these DWords.
15313 // We need to balance this to ensure we don't form a 3-1 shuffle in
15314 // the other half.
15316 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15317 llvm::count(AToBInputs, 2 * ADWord + 1);
15318 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15319 llvm::count(BToBInputs, 2 * BDWord + 1);
15320 if ((NumFlippedAToBInputs == 1 &&
15321 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15322 (NumFlippedBToBInputs == 1 &&
15323 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15324 // We choose whether to fix the A half or B half based on whether that
15325 // half has zero flipped inputs. At zero, we may not be able to fix it
15326 // with that half. We also bias towards fixing the B half because that
15327 // will more commonly be the high half, and we have to bias one way.
15328 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15329 ArrayRef<int> Inputs) {
15330 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15331 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15332 // Determine whether the free index is in the flipped dword or the
15333 // unflipped dword based on where the pinned index is. We use this bit
15334 // in an xor to conditionally select the adjacent dword.
15335 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15336 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15337 if (IsFixIdxInput == IsFixFreeIdxInput)
15338 FixFreeIdx += 1;
15339 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15340 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15341 "We need to be changing the number of flipped inputs!");
15342 int PSHUFHalfMask[] = {0, 1, 2, 3};
15343 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15344 V = DAG.getNode(
15345 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15346 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15347 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15348
15349 for (int &M : Mask)
15350 if (M >= 0 && M == FixIdx)
15351 M = FixFreeIdx;
15352 else if (M >= 0 && M == FixFreeIdx)
15353 M = FixIdx;
15354 };
15355 if (NumFlippedBToBInputs != 0) {
15356 int BPinnedIdx =
15357 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15358 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15359 } else {
15360 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15361 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15362 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15363 }
15364 }
15365 }
15366
15367 int PSHUFDMask[] = {0, 1, 2, 3};
15368 PSHUFDMask[ADWord] = BDWord;
15369 PSHUFDMask[BDWord] = ADWord;
15370 V = DAG.getBitcast(
15371 VT,
15372 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15373 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15374
15375 // Adjust the mask to match the new locations of A and B.
15376 for (int &M : Mask)
15377 if (M >= 0 && M/2 == ADWord)
15378 M = 2 * BDWord + M % 2;
15379 else if (M >= 0 && M/2 == BDWord)
15380 M = 2 * ADWord + M % 2;
15381
15382 // Recurse back into this routine to re-compute state now that this isn't
15383 // a 3 and 1 problem.
15384 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15385 };
15386 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15387 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15388 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15389 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15390
15391 // At this point there are at most two inputs to the low and high halves from
15392 // each half. That means the inputs can always be grouped into dwords and
15393 // those dwords can then be moved to the correct half with a dword shuffle.
15394 // We use at most one low and one high word shuffle to collect these paired
15395 // inputs into dwords, and finally a dword shuffle to place them.
15396 int PSHUFLMask[4] = {-1, -1, -1, -1};
15397 int PSHUFHMask[4] = {-1, -1, -1, -1};
15398 int PSHUFDMask[4] = {-1, -1, -1, -1};
15399
15400 // First fix the masks for all the inputs that are staying in their
15401 // original halves. This will then dictate the targets of the cross-half
15402 // shuffles.
15403 auto fixInPlaceInputs =
15404 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15405 MutableArrayRef<int> SourceHalfMask,
15406 MutableArrayRef<int> HalfMask, int HalfOffset) {
15407 if (InPlaceInputs.empty())
15408 return;
15409 if (InPlaceInputs.size() == 1) {
15410 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15411 InPlaceInputs[0] - HalfOffset;
15412 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15413 return;
15414 }
15415 if (IncomingInputs.empty()) {
15416 // Just fix all of the in place inputs.
15417 for (int Input : InPlaceInputs) {
15418 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15419 PSHUFDMask[Input / 2] = Input / 2;
15420 }
15421 return;
15422 }
15423
15424 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15425 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15426 InPlaceInputs[0] - HalfOffset;
15427 // Put the second input next to the first so that they are packed into
15428 // a dword. We find the adjacent index by toggling the low bit.
15429 int AdjIndex = InPlaceInputs[0] ^ 1;
15430 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15431 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15432 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15433 };
15434 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15435 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15436
15437 // Now gather the cross-half inputs and place them into a free dword of
15438 // their target half.
15439 // FIXME: This operation could almost certainly be simplified dramatically to
15440 // look more like the 3-1 fixing operation.
15441 auto moveInputsToRightHalf = [&PSHUFDMask](
15442 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15443 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15444 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15445 int DestOffset) {
15446 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15447 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15448 };
15449 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15450 int Word) {
15451 int LowWord = Word & ~1;
15452 int HighWord = Word | 1;
15453 return isWordClobbered(SourceHalfMask, LowWord) ||
15454 isWordClobbered(SourceHalfMask, HighWord);
15455 };
15456
15457 if (IncomingInputs.empty())
15458 return;
15459
15460 if (ExistingInputs.empty()) {
15461 // Map any dwords with inputs from them into the right half.
15462 for (int Input : IncomingInputs) {
15463 // If the source half mask maps over the inputs, turn those into
15464 // swaps and use the swapped lane.
15465 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15466 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15467 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15468 Input - SourceOffset;
15469 // We have to swap the uses in our half mask in one sweep.
15470 for (int &M : HalfMask)
15471 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15472 M = Input;
15473 else if (M == Input)
15474 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15475 } else {
15476 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15477 Input - SourceOffset &&
15478 "Previous placement doesn't match!");
15479 }
15480 // Note that this correctly re-maps both when we do a swap and when
15481 // we observe the other side of the swap above. We rely on that to
15482 // avoid swapping the members of the input list directly.
15483 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15484 }
15485
15486 // Map the input's dword into the correct half.
15487 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15488 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15489 else
15490 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
15491 Input / 2 &&
15492 "Previous placement doesn't match!");
15493 }
15494
15495 // And just directly shift any other-half mask elements to be same-half
15496 // as we will have mirrored the dword containing the element into the
15497 // same position within that half.
15498 for (int &M : HalfMask)
15499 if (M >= SourceOffset && M < SourceOffset + 4) {
15500 M = M - SourceOffset + DestOffset;
15501 assert(M >= 0 && "This should never wrap below zero!");
15502 }
15503 return;
15504 }
15505
15506 // Ensure we have the input in a viable dword of its current half. This
15507 // is particularly tricky because the original position may be clobbered
15508 // by inputs being moved and *staying* in that half.
15509 if (IncomingInputs.size() == 1) {
15510 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15511 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15512 SourceOffset;
15513 SourceHalfMask[InputFixed - SourceOffset] =
15514 IncomingInputs[0] - SourceOffset;
15515 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15516 InputFixed);
15517 IncomingInputs[0] = InputFixed;
15518 }
15519 } else if (IncomingInputs.size() == 2) {
15520 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15521 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15522 // We have two non-adjacent or clobbered inputs we need to extract from
15523 // the source half. To do this, we need to map them into some adjacent
15524 // dword slot in the source mask.
15525 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15526 IncomingInputs[1] - SourceOffset};
15527
15528 // If there is a free slot in the source half mask adjacent to one of
15529 // the inputs, place the other input in it. We use (Index XOR 1) to
15530 // compute an adjacent index.
15531 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15532 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15533 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15534 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15535 InputsFixed[1] = InputsFixed[0] ^ 1;
15536 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15537 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15538 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15539 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15540 InputsFixed[0] = InputsFixed[1] ^ 1;
15541 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15542 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15543 // The two inputs are in the same DWord but it is clobbered and the
15544 // adjacent DWord isn't used at all. Move both inputs to the free
15545 // slot.
15546 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15547 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15548 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15549 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
15550 } else {
15551 // The only way we hit this point is if there is no clobbering
15552 // (because there are no off-half inputs to this half) and there is no
15553 // free slot adjacent to one of the inputs. In this case, we have to
15554 // swap an input with a non-input.
15555 for (int i = 0; i < 4; ++i)
15556 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
15557 "We can't handle any clobbers here!");
15558 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15559 "Cannot have adjacent inputs here!");
15560
15561 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15562 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15563
15564 // We also have to update the final source mask in this case because
15565 // it may need to undo the above swap.
15566 for (int &M : FinalSourceHalfMask)
15567 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15568 M = InputsFixed[1] + SourceOffset;
15569 else if (M == InputsFixed[1] + SourceOffset)
15570 M = (InputsFixed[0] ^ 1) + SourceOffset;
15571
15572 InputsFixed[1] = InputsFixed[0] ^ 1;
15573 }
15574
15575 // Point everything at the fixed inputs.
15576 for (int &M : HalfMask)
15577 if (M == IncomingInputs[0])
15578 M = InputsFixed[0] + SourceOffset;
15579 else if (M == IncomingInputs[1])
15580 M = InputsFixed[1] + SourceOffset;
15581
15582 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15583 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15584 }
15585 } else {
15586 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15586)
;
15587 }
15588
15589 // Now hoist the DWord down to the right half.
15590 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15591 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15592 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15593 for (int &M : HalfMask)
15594 for (int Input : IncomingInputs)
15595 if (M == Input)
15596 M = FreeDWord * 2 + Input % 2;
15597 };
15598 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15599 /*SourceOffset*/ 4, /*DestOffset*/ 0);
15600 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15601 /*SourceOffset*/ 0, /*DestOffset*/ 4);
15602
15603 // Now enact all the shuffles we've computed to move the inputs into their
15604 // target half.
15605 if (!isNoopShuffleMask(PSHUFLMask))
15606 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15607 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15608 if (!isNoopShuffleMask(PSHUFHMask))
15609 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15610 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15611 if (!isNoopShuffleMask(PSHUFDMask))
15612 V = DAG.getBitcast(
15613 VT,
15614 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15615 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15616
15617 // At this point, each half should contain all its inputs, and we can then
15618 // just shuffle them into their final position.
15619 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15620 "Failed to lift all the high half inputs to the low mask!");
15621 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15622 "Failed to lift all the low half inputs to the high mask!");
15623
15624 // Do a half shuffle for the low mask.
15625 if (!isNoopShuffleMask(LoMask))
15626 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15627 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15628
15629 // Do a half shuffle with the high mask after shifting its values down.
15630 for (int &M : HiMask)
15631 if (M >= 0)
15632 M -= 4;
15633 if (!isNoopShuffleMask(HiMask))
15634 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15635 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15636
15637 return V;
15638}
15639
15640/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15641/// blend if only one input is used.
15642static SDValue lowerShuffleAsBlendOfPSHUFBs(
15643 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15644 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15645 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15646 "Lane crossing shuffle masks not supported");
15647
15648 int NumBytes = VT.getSizeInBits() / 8;
15649 int Size = Mask.size();
15650 int Scale = NumBytes / Size;
15651
15652 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15653 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15654 V1InUse = false;
15655 V2InUse = false;
15656
15657 for (int i = 0; i < NumBytes; ++i) {
15658 int M = Mask[i / Scale];
15659 if (M < 0)
15660 continue;
15661
15662 const int ZeroMask = 0x80;
15663 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15664 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15665 if (Zeroable[i / Scale])
15666 V1Idx = V2Idx = ZeroMask;
15667
15668 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15669 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15670 V1InUse |= (ZeroMask != V1Idx);
15671 V2InUse |= (ZeroMask != V2Idx);
15672 }
15673
15674 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15675 if (V1InUse)
15676 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15677 DAG.getBuildVector(ShufVT, DL, V1Mask));
15678 if (V2InUse)
15679 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15680 DAG.getBuildVector(ShufVT, DL, V2Mask));
15681
15682 // If we need shuffled inputs from both, blend the two.
15683 SDValue V;
15684 if (V1InUse && V2InUse)
15685 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15686 else
15687 V = V1InUse ? V1 : V2;
15688
15689 // Cast the result back to the correct type.
15690 return DAG.getBitcast(VT, V);
15691}
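// Illustrative sketch (hypothetical mask values and helper name, plain ints
// instead of SDValues) of how the two PSHUFB byte-selector masks above are
// derived from a word-level shuffle mask. The 0x80 value mirrors ZeroMask:
// PSHUFB zeroes any byte whose selector has its top bit set.
inline void sketchPSHUFBByteMasks() {
  const int Size = 8;                // v8i16 -> 8 mask elements
  const int NumBytes = 16;           // 128 bits / 8
  const int Scale = NumBytes / Size; // 2 bytes per i16 element
  const int ZeroMask = 0x80;
  int Mask[Size] = {0, 9, 2, 11, 4, 13, 6, 15}; // hypothetical two-input blend
  int V1Mask[NumBytes], V2Mask[NumBytes];
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    // Elements < Size come from V1; the rest come from V2, rebased by Size.
    V1Mask[i] = M < Size ? M * Scale + i % Scale : ZeroMask;
    V2Mask[i] = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
  }
  // V1Mask == {0,1, 0x80,0x80, 4,5, 0x80,0x80, ...} and V2Mask selects the
  // complementary bytes, so ORing PSHUFB(V1) with PSHUFB(V2) forms the blend.
  (void)V1Mask;
  (void)V2Mask;
}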
15692
15693/// Generic lowering of 8-lane i16 shuffles.
15694///
15695/// This handles both single-input shuffles and combined shuffle/blends with
15696/// two inputs. The single input shuffles are immediately delegated to
15697/// a dedicated lowering routine.
15698///
15699/// The blends are lowered in one of three fundamental ways. If there are few
15700/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15701/// of the input is significantly cheaper when lowered as an interleaving of
15702/// the two inputs, try to interleave them. Otherwise, blend the low and high
15703/// halves of the inputs separately (making them have relatively few inputs)
15704/// and then concatenate them.
15705static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15706 const APInt &Zeroable, SDValue V1, SDValue V2,
15707 const X86Subtarget &Subtarget,
15708 SelectionDAG &DAG) {
15709 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15710 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15711 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15712
15713 // Whenever we can lower this as a zext, that instruction is strictly faster
15714 // than any alternative.
15715 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15716 Zeroable, Subtarget, DAG))
15717 return ZExt;
15718
15719 // Try to lower using a truncation.
15720 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15721 Subtarget, DAG))
15722 return V;
15723
15724 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15725
15726 if (NumV2Inputs == 0) {
15727 // Try to use shift instructions.
15728 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15729 Zeroable, Subtarget, DAG))
15730 return Shift;
15731
15732 // Check for being able to broadcast a single element.
15733 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15734 Mask, Subtarget, DAG))
15735 return Broadcast;
15736
15737 // Try to use bit rotation instructions.
15738 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15739 Subtarget, DAG))
15740 return Rotate;
15741
15742 // Use dedicated unpack instructions for masks that match their pattern.
15743 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15744 return V;
15745
15746 // Use dedicated pack instructions for masks that match their pattern.
15747 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15748 Subtarget))
15749 return V;
15750
15751 // Try to use byte rotation instructions.
15752 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15753 Subtarget, DAG))
15754 return Rotate;
15755
15756 // Make a copy of the mask so it can be modified.
15757 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15758 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15759 Subtarget, DAG);
15760 }
15761
15762 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15763 "All single-input shuffles should be canonicalized to be V1-input "
15764 "shuffles.");
15765
15766 // Try to use shift instructions.
15767 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15768 Zeroable, Subtarget, DAG))
15769 return Shift;
15770
15771 // See if we can use SSE4A Extraction / Insertion.
15772 if (Subtarget.hasSSE4A())
15773 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15774 Zeroable, DAG))
15775 return V;
15776
15777 // There are special ways we can lower some single-element blends.
15778 if (NumV2Inputs == 1)
15779 if (SDValue V = lowerShuffleAsElementInsertion(
15780 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15781 return V;
15782
15783 // We have different paths for blend lowering, but they all must use the
15784 // *exact* same predicate.
15785 bool IsBlendSupported = Subtarget.hasSSE41();
15786 if (IsBlendSupported)
15787 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15788 Zeroable, Subtarget, DAG))
15789 return Blend;
15790
15791 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15792 Zeroable, Subtarget, DAG))
15793 return Masked;
15794
15795 // Use dedicated unpack instructions for masks that match their pattern.
15796 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15797 return V;
15798
15799 // Use dedicated pack instructions for masks that match their pattern.
15800 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15801 Subtarget))
15802 return V;
15803
15804 // Try to lower using a truncation.
15805 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15806 Subtarget, DAG))
15807 return V;
15808
15809 // Try to use byte rotation instructions.
15810 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15811 Subtarget, DAG))
15812 return Rotate;
15813
15814 if (SDValue BitBlend =
15815 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15816 return BitBlend;
15817
15818 // Try to use byte shift instructions to mask.
15819 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15820 Zeroable, Subtarget, DAG))
15821 return V;
15822
15823 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15824 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15825 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15826 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
15827 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15828 !Subtarget.hasVLX()) {
15829 // Check if this is part of a 256-bit vector truncation.
15830 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
15831 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15832 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
15833 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
15834 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
15835 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
15836 DAG.getTargetConstant(0xEE, DL, MVT::i8));
15837 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
15838 V1 = extract128BitVector(V1V2, 0, DAG, DL);
15839 V2 = extract128BitVector(V1V2, 4, DAG, DL);
15840 } else {
15841 SmallVector<SDValue, 4> DWordClearOps(4,
15842 DAG.getConstant(0, DL, MVT::i32));
15843 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15844 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15845 SDValue DWordClearMask =
15846 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15847 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15848 DWordClearMask);
15849 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15850 DWordClearMask);
15851 }
15852 // Now pack things back together.
15853 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15854 if (NumEvenDrops == 2) {
15855 Result = DAG.getBitcast(MVT::v4i32, Result);
15856 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15857 }
15858 return Result;
15859 }
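// A rough worked example (hypothetical values) of the NumEvenDrops == 1 path
// above: ANDing each dword with 0x0000FFFF keeps only the even i16 element,
// so the unsigned-saturating PACKUSDW cannot overflow and simply compacts the
// survivors.
inline void sketchEvenDropCompaction() {
  unsigned V1[4] = {0x11112222, 0x33334444, 0x55556666, 0x77778888};
  unsigned V2[4] = {0x9999AAAA, 0xBBBBCCCC, 0xDDDDEEEE, 0xFFFF0000};
  unsigned short Packed[8];
  for (int i = 0; i != 4; ++i) {
    Packed[i] = (unsigned short)(V1[i] & 0xFFFF);     // low half from V1
    Packed[i + 4] = (unsigned short)(V2[i] & 0xFFFF); // high half from V2
  }
  // Packed == {0x2222,0x4444,0x6666,0x8888, 0xAAAA,0xCCCC,0xEEEE,0x0000},
  // i.e. the even elements of V1 followed by the even elements of V2.
  (void)Packed;
}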
15860
15861 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
15862 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
15863 if (NumOddDrops == 1) {
15864 bool HasSSE41 = Subtarget.hasSSE41();
15865 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15866 DAG.getBitcast(MVT::v4i32, V1),
15867 DAG.getTargetConstant(16, DL, MVT::i8));
15868 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15869 DAG.getBitcast(MVT::v4i32, V2),
15870 DAG.getTargetConstant(16, DL, MVT::i8));
15871 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
15872 MVT::v8i16, V1, V2);
15873 }
15874
15875 // Try to lower by permuting the inputs into an unpack instruction.
15876 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15877 Mask, Subtarget, DAG))
15878 return Unpack;
15879
15880 // If we can't directly blend but can use PSHUFB, that will be better as it
15881 // can both shuffle and set up the inefficient blend.
15882 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15883 bool V1InUse, V2InUse;
15884 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15885 Zeroable, DAG, V1InUse, V2InUse);
15886 }
15887
15888 // We can always bit-blend if we have to so the fallback strategy is to
15889 // decompose into single-input permutes and blends/unpacks.
15890 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15891 Mask, Subtarget, DAG);
15892}
15893
15894/// Lower 8-lane 16-bit floating point shuffles.
15895static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15896 const APInt &Zeroable, SDValue V1, SDValue V2,
15897 const X86Subtarget &Subtarget,
15898 SelectionDAG &DAG) {
15899 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15900 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15901 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15902 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15903
15904 if (NumV2Elements == 0) {
15905 // Check for being able to broadcast a single element.
15906 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15907 Mask, Subtarget, DAG))
15908 return Broadcast;
15909 }
15910 if (NumV2Elements == 1 && Mask[0] >= 8)
15911 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask,
15912 Zeroable, Subtarget, DAG))
15913 return V;
15914
15915 V1 = DAG.getBitcast(MVT::v8i16, V1);
15916 V2 = DAG.getBitcast(MVT::v8i16, V2);
15917 return DAG.getBitcast(MVT::v8f16,
15918 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15919}
15920
15921 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
15922 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
15923 // the active subvector is extracted.
15924static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15925 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15926 const X86Subtarget &Subtarget,
15927 SelectionDAG &DAG) {
15928 MVT MaskVT = VT.changeTypeToInteger();
15929 SDValue MaskNode;
15930 MVT ShuffleVT = VT;
15931 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15932 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15933 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15934 ShuffleVT = V1.getSimpleValueType();
15935
15936 // Adjust mask to correct indices for the second input.
15937 int NumElts = VT.getVectorNumElements();
15938 unsigned Scale = 512 / VT.getSizeInBits();
15939 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15940 for (int &M : AdjustedMask)
15941 if (NumElts <= M)
15942 M += (Scale - 1) * NumElts;
15943 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15944 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15945 } else {
15946 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15947 }
15948
15949 SDValue Result;
15950 if (V2.isUndef())
15951 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15952 else
15953 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15954
15955 if (VT != ShuffleVT)
15956 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15957
15958 return Result;
15959}
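// A minimal sketch (hypothetical sizes and helper name) of the mask rebasing
// above: when a v8i32 shuffle is widened to v16i32 for VPERMV3, indices that
// referred to the second input must be shifted so they still land in V2 after
// widening.
inline void sketchPermvMaskWidening() {
  const int NumElts = 8;       // original v8i32 element count
  const int Scale = 512 / 256; // widened by 2x
  int Mask[8] = {0, 9, 2, 11, 4, 13, 6, 15}; // indices >= NumElts read V2
  for (int &M : Mask)
    if (NumElts <= M)
      M += (Scale - 1) * NumElts; // 9 -> 17, 11 -> 19, ...
  // In the widened index space, V2 begins at element 16, so what used to be
  // "V2 element 1" (index 9) is now index 17, and so on.
  (void)Mask;
}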
15960
15961/// Generic lowering of v16i8 shuffles.
15962///
15963/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15964/// detect any complexity reducing interleaving. If that doesn't help, it uses
15965/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15966/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15967/// back together.
15968static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15969 const APInt &Zeroable, SDValue V1, SDValue V2,
15970 const X86Subtarget &Subtarget,
15971 SelectionDAG &DAG) {
15972 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15973 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15974 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15975
15976 // Try to use shift instructions.
15977 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15978 Zeroable, Subtarget, DAG))
15979 return Shift;
15980
15981 // Try to use byte rotation instructions.
15982 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15983 Subtarget, DAG))
15984 return Rotate;
15985
15986 // Use dedicated pack instructions for masks that match their pattern.
15987 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15988 Subtarget))
15989 return V;
15990
15991 // Try to use a zext lowering.
15992 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15993 Zeroable, Subtarget, DAG))
15994 return ZExt;
15995
15996 // Try to lower using a truncation.
15997 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15998 Subtarget, DAG))
15999 return V;
16000
16001 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16002 Subtarget, DAG))
16003 return V;
16004
16005 // See if we can use SSE4A Extraction / Insertion.
16006 if (Subtarget.hasSSE4A())
16007 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16008 Zeroable, DAG))
16009 return V;
16010
16011 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16012
16013 // For single-input shuffles, there are some nicer lowering tricks we can use.
16014 if (NumV2Elements == 0) {
16015 // Check for being able to broadcast a single element.
16016 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16017 Mask, Subtarget, DAG))
16018 return Broadcast;
16019
16020 // Try to use bit rotation instructions.
16021 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16022 Subtarget, DAG))
16023 return Rotate;
16024
16025 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16026 return V;
16027
16028 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16029 // Notably, this handles splat and partial-splat shuffles more efficiently.
16030 // However, it only makes sense if the pre-duplication shuffle simplifies
16031 // things significantly. Currently, this means we need to be able to
16032 // express the pre-duplication shuffle as an i16 shuffle.
16033 //
16034 // FIXME: We should check for other patterns which can be widened into an
16035 // i16 shuffle as well.
16036 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16037 for (int i = 0; i < 16; i += 2)
16038 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16039 return false;
16040
16041 return true;
16042 };
16043 auto tryToWidenViaDuplication = [&]() -> SDValue {
16044 if (!canWidenViaDuplication(Mask))
16045 return SDValue();
16046 SmallVector<int, 4> LoInputs;
16047 copy_if(Mask, std::back_inserter(LoInputs),
16048 [](int M) { return M >= 0 && M < 8; });
16049 array_pod_sort(LoInputs.begin(), LoInputs.end());
16050 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16051 LoInputs.end());
16052 SmallVector<int, 4> HiInputs;
16053 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16054 array_pod_sort(HiInputs.begin(), HiInputs.end());
16055 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16056 HiInputs.end());
16057
16058 bool TargetLo = LoInputs.size() >= HiInputs.size();
16059 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16060 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16061
16062 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16063 SmallDenseMap<int, int, 8> LaneMap;
16064 for (int I : InPlaceInputs) {
16065 PreDupI16Shuffle[I/2] = I/2;
16066 LaneMap[I] = I;
16067 }
16068 int j = TargetLo ? 0 : 4, je = j + 4;
16069 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16070 // Check if j is already a shuffle of this input. This happens when
16071 // there are two adjacent bytes after we move the low one.
16072 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16073 // If we haven't yet mapped the input, search for a slot into which
16074 // we can map it.
16075 while (j < je && PreDupI16Shuffle[j] >= 0)
16076 ++j;
16077
16078 if (j == je)
16079 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16080 return SDValue();
16081
16082 // Map this input with the i16 shuffle.
16083 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16084 }
16085
16086 // Update the lane map based on the mapping we ended up with.
16087 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16088 }
16089 V1 = DAG.getBitcast(
16090 MVT::v16i8,
16091 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16092 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16093
16094 // Unpack the bytes to form the i16s that will be shuffled into place.
16095 bool EvenInUse = false, OddInUse = false;
16096 for (int i = 0; i < 16; i += 2) {
16097 EvenInUse |= (Mask[i + 0] >= 0);
16098 OddInUse |= (Mask[i + 1] >= 0);
16099 if (EvenInUse && OddInUse)
16100 break;
16101 }
16102 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16103 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16104 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16105
16106 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16107 for (int i = 0; i < 16; ++i)
16108 if (Mask[i] >= 0) {
16109 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16110 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16111 if (PostDupI16Shuffle[i / 2] < 0)
16112 PostDupI16Shuffle[i / 2] = MappedMask;
16113 else
16114 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16115 "Conflicting entries in the original shuffle!");
16116 }
16117 return DAG.getBitcast(
16118 MVT::v16i8,
16119 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16120 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16121 };
16122 if (SDValue V = tryToWidenViaDuplication())
16123 return V;
16124 }
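// Small standalone sketch (hypothetical helper) of the canWidenViaDuplication
// test used above: a v16i8 mask can be widened to a v8i16 shuffle only when
// every byte pair wants the same source byte (or is undef).
inline bool sketchCanWidenViaDuplication(const int (&Mask)[16]) {
  for (int i = 0; i < 16; i += 2)
    if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
      return false; // the pair needs two different bytes, so duplication fails
  return true;
}
// For example, a byte splat {3,3,3,3,...} passes, while an interleave such as
// {0,8,1,9,...} fails because each pair mixes two different source bytes.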
16125
16126 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16127 Zeroable, Subtarget, DAG))
16128 return Masked;
16129
16130 // Use dedicated unpack instructions for masks that match their pattern.
16131 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16132 return V;
16133
16134 // Try to use byte shift instructions to mask.
16135 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16136 Zeroable, Subtarget, DAG))
16137 return V;
16138
16139 // Check for compaction patterns.
16140 bool IsSingleInput = V2.isUndef();
16141 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16142
16143 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16144 // with PSHUFB. It is important to do this before we attempt to generate any
16145 // blends but after all of the single-input lowerings. If the single input
16146 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16147 // want to preserve that and we can DAG combine any longer sequences into
16148 // a PSHUFB in the end. But once we start blending from multiple inputs,
16149 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16150 // and there are *very* few patterns that would actually be faster than the
16151 // PSHUFB approach because of its ability to zero lanes.
16152 //
16153 // If the mask is a binary compaction, we can more efficiently perform this
16154 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16155 //
16156 // FIXME: The only exceptions to the above are blends which are exact
16157 // interleavings with direct instructions supporting them. We currently don't
16158 // handle those well here.
16159 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16160 bool V1InUse = false;
16161 bool V2InUse = false;
16162
16163 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16164 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16165
16166 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16167 // do so. This avoids using them to handle blends-with-zero which is
16168 // important as a single pshufb is significantly faster for that.
16169 if (V1InUse && V2InUse) {
16170 if (Subtarget.hasSSE41())
16171 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16172 Zeroable, Subtarget, DAG))
16173 return Blend;
16174
16175 // We can use an unpack to do the blending rather than an or in some
16176 // cases. Even though the or may be (very minorly) more efficient, we
16177 // prefer this lowering because there are common cases where part of
16178 // the complexity of the shuffles goes away when we do the final blend as
16179 // an unpack.
16180 // FIXME: It might be worth trying to detect if the unpack-feeding
16181 // shuffles will both be pshufb, in which case we shouldn't bother with
16182 // this.
16183 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16184 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16185 return Unpack;
16186
16187 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16188 if (Subtarget.hasVBMI())
16189 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16190 DAG);
16191
16192 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16193 if (Subtarget.hasXOP()) {
16194 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16195 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16196 }
16197
16198 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16199 // PALIGNR will be cheaper than the second PSHUFB+OR.
16200 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16201 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16202 return V;
16203 }
16204
16205 return PSHUFB;
16206 }
16207
16208 // There are special ways we can lower some single-element blends.
16209 if (NumV2Elements == 1)
16210 if (SDValue V = lowerShuffleAsElementInsertion(
16211 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16212 return V;
16213
16214 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16215 return Blend;
16216
16217 // Check whether a compaction lowering can be done. This handles shuffles
16218 // which take every Nth element for some even N. See the helper function for
16219 // details.
16220 //
16221 // We special case these as they can be particularly efficiently handled with
16222 // the PACKUSWB instruction on x86 and they show up in common patterns of
16223 // rearranging bytes to truncate wide elements.
16224 if (NumEvenDrops) {
16225 // NumEvenDrops is the power of two stride of the elements. Another way of
16226 // thinking about it is that we need to drop the even elements this many
16227 // times to get the original input.
16228
16229 // First we need to zero all the dropped bytes.
16230 assert(NumEvenDrops <= 3 &&
16231 "No support for dropping even elements more than 3 times.");
16232 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16233 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16234 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16235 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16236 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16237 WordClearMask);
16238 if (!IsSingleInput)
16239 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16240 WordClearMask);
16241
16242 // Now pack things back together.
16243 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16244 IsSingleInput ? V1 : V2);
16245 for (int i = 1; i < NumEvenDrops; ++i) {
16246 Result = DAG.getBitcast(MVT::v8i16, Result);
16247 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16248 }
16249 return Result;
16250 }
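// A rough illustration (hypothetical drop count) of the repeated-pack loop
// above: NumEvenDrops is the log2 of the element stride, so keeping every 4th
// byte (NumEvenDrops == 2) needs the word-clear AND plus two PACKUS rounds.
inline void sketchRepeatedPack() {
  const int NumEvenDrops = 2;
  unsigned short WordClearOps[8] = {0};
  // Mark which i16 lanes keep their low byte before the first pack: one lane
  // out of every (1 << (NumEvenDrops - 1)) == 2.
  for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
    WordClearOps[i] = 0xFF;
  // The first PACKUS packs the low byte of every word (the masked vector's
  // even bytes); packing that intermediate result again leaves bytes
  // 0, 4, 8, 12 of the original vector in the low lanes.
  (void)WordClearOps;
}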
16251
16252 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16253 if (NumOddDrops == 1) {
16254 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16255 DAG.getBitcast(MVT::v8i16, V1),
16256 DAG.getTargetConstant(8, DL, MVT::i8));
16257 if (!IsSingleInput)
16258 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16259 DAG.getBitcast(MVT::v8i16, V2),
16260 DAG.getTargetConstant(8, DL, MVT::i8));
16261 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16262 IsSingleInput ? V1 : V2);
16263 }
16264
16265 // Handle multi-input cases by blending/unpacking single-input shuffles.
16266 if (NumV2Elements > 0)
16267 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16268 Subtarget, DAG);
16269
16270 // The fallback path for single-input shuffles widens this into two v8i16
16271 // vectors with unpacks, shuffles those, and then pulls them back together
16272 // with a pack.
16273 SDValue V = V1;
16274
16275 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16276 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16277 for (int i = 0; i < 16; ++i)
16278 if (Mask[i] >= 0)
16279 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16280
16281 SDValue VLoHalf, VHiHalf;
16282 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16283 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16284 // i16s.
16285 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16286 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16287 // Use a mask to drop the high bytes.
16288 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16289 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16290 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16291
16292 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16293 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16294
16295 // Squash the masks to point directly into VLoHalf.
16296 for (int &M : LoBlendMask)
16297 if (M >= 0)
16298 M /= 2;
16299 for (int &M : HiBlendMask)
16300 if (M >= 0)
16301 M /= 2;
16302 } else {
16303 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16304 // VHiHalf so that we can blend them as i16s.
16305 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16306
16307 VLoHalf = DAG.getBitcast(
16308 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16309 VHiHalf = DAG.getBitcast(
16310 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16311 }
16312
16313 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16314 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16315
16316 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16317}
16318
16319/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16320///
16321/// This routine breaks down the specific type of 128-bit shuffle and
16322/// dispatches to the lowering routines accordingly.
16323static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16324 MVT VT, SDValue V1, SDValue V2,
16325 const APInt &Zeroable,
16326 const X86Subtarget &Subtarget,
16327 SelectionDAG &DAG) {
16328 switch (VT.SimpleTy) {
16329 case MVT::v2i64:
16330 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16331 case MVT::v2f64:
16332 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16333 case MVT::v4i32:
16334 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16335 case MVT::v4f32:
16336 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16337 case MVT::v8i16:
16338 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16339 case MVT::v8f16:
16340 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16341 case MVT::v16i8:
16342 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16343
16344 default:
16345 llvm_unreachable("Unimplemented!");
16346 }
16347}
16348
16349/// Generic routine to split vector shuffle into half-sized shuffles.
16350///
16351/// This routine just extracts two subvectors, shuffles them independently, and
16352/// then concatenates them back together. This should work effectively with all
16353/// AVX vector shuffle types.
16354static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16355 SDValue V2, ArrayRef<int> Mask,
16356 SelectionDAG &DAG) {
16357 assert(VT.getSizeInBits() >= 256 &&
16358 "Only for 256-bit or wider vector shuffles!");
16359 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16360 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16361
16362 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16363 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16364
16365 int NumElements = VT.getVectorNumElements();
16366 int SplitNumElements = NumElements / 2;
16367 MVT ScalarVT = VT.getVectorElementType();
16368 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16369
16370 // Use splitVector/extractSubVector so that split build-vectors just build two
16371 // narrower build vectors. This helps shuffling with splats and zeros.
16372 auto SplitVector = [&](SDValue V) {
16373 SDValue LoV, HiV;
16374 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16375 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16376 DAG.getBitcast(SplitVT, HiV));
16377 };
16378
16379 SDValue LoV1, HiV1, LoV2, HiV2;
16380 std::tie(LoV1, HiV1) = SplitVector(V1);
16381 std::tie(LoV2, HiV2) = SplitVector(V2);
16382
16383 // Now create two 4-way blends of these half-width vectors.
16384 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16385 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16386 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16387 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16388 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16389 for (int i = 0; i < SplitNumElements; ++i) {
16390 int M = HalfMask[i];
16391 if (M >= NumElements) {
16392 if (M >= NumElements + SplitNumElements)
16393 UseHiV2 = true;
16394 else
16395 UseLoV2 = true;
16396 V2BlendMask[i] = M - NumElements;
16397 BlendMask[i] = SplitNumElements + i;
16398 } else if (M >= 0) {
16399 if (M >= SplitNumElements)
16400 UseHiV1 = true;
16401 else
16402 UseLoV1 = true;
16403 V1BlendMask[i] = M;
16404 BlendMask[i] = i;
16405 }
16406 }
16407
16408 // Because the lowering happens after all combining takes place, we need to
16409 // manually combine these blend masks as much as possible so that we create
16410 // a minimal number of high-level vector shuffle nodes.
16411
16412 // First try just blending the halves of V1 or V2.
16413 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16414 return DAG.getUNDEF(SplitVT);
16415 if (!UseLoV2 && !UseHiV2)
16416 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16417 if (!UseLoV1 && !UseHiV1)
16418 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16419
16420 SDValue V1Blend, V2Blend;
16421 if (UseLoV1 && UseHiV1) {
16422 V1Blend =
16423 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16424 } else {
16425 // We only use half of V1 so map the usage down into the final blend mask.
16426 V1Blend = UseLoV1 ? LoV1 : HiV1;
16427 for (int i = 0; i < SplitNumElements; ++i)
16428 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16429 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16430 }
16431 if (UseLoV2 && UseHiV2) {
16432 V2Blend =
16433 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16434 } else {
16435 // We only use half of V2 so map the usage down into the final blend mask.
16436 V2Blend = UseLoV2 ? LoV2 : HiV2;
16437 for (int i = 0; i < SplitNumElements; ++i)
16438 if (BlendMask[i] >= SplitNumElements)
16439 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16440 }
16441 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16442 };
16443 SDValue Lo = HalfBlend(LoMask);
16444 SDValue Hi = HalfBlend(HiMask);
16445 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16446}
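// A rough standalone sketch (hypothetical 8-element mask) of the half-blend
// bookkeeping above: each half mask is rewritten against the split operands,
// and BlendMask records which per-operand blend feeds each output element.
inline void sketchHalfBlendMasks() {
  const int NumElements = 8, SplitNumElements = 4;
  int HalfMask[4] = {1, 10, 5, 12}; // low-half mask of a two-input shuffle
  int V1BlendMask[4] = {-1, -1, -1, -1};
  int V2BlendMask[4] = {-1, -1, -1, -1};
  int BlendMask[4] = {-1, -1, -1, -1};
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      V2BlendMask[i] = M - NumElements;    // index into V2's half pair
      BlendMask[i] = SplitNumElements + i; // take lane i of the V2 blend
    } else if (M >= 0) {
      V1BlendMask[i] = M;                  // index into V1's half pair
      BlendMask[i] = i;                    // take lane i of the V1 blend
    }
  }
  // V1BlendMask == {1,-1,5,-1}, V2BlendMask == {-1,2,-1,4}, and
  // BlendMask == {0,5,2,7}: shuffle each operand's halves, then blend them.
  (void)V1BlendMask; (void)V2BlendMask; (void)BlendMask;
}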
16447
16448/// Either split a vector in halves or decompose the shuffles and the
16449/// blend/unpack.
16450///
16451/// This is provided as a good fallback for many lowerings of non-single-input
16452/// shuffles with more than one 128-bit lane. In those cases, we want to select
16453/// between splitting the shuffle into 128-bit components and stitching those
16454/// back together vs. extracting the single-input shuffles and blending those
16455/// results.
16456static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16457 SDValue V2, ArrayRef<int> Mask,
16458 const X86Subtarget &Subtarget,
16459 SelectionDAG &DAG) {
16460 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16461 "shuffles as it could then recurse on itself.");
16462 int Size = Mask.size();
16463
16464 // If this can be modeled as a broadcast of two elements followed by a blend,
16465 // prefer that lowering. This is especially important because broadcasts can
16466 // often fold with memory operands.
16467 auto DoBothBroadcast = [&] {
16468 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16469 for (int M : Mask)
16470 if (M >= Size) {
16471 if (V2BroadcastIdx < 0)
16472 V2BroadcastIdx = M - Size;
16473 else if (M - Size != V2BroadcastIdx)
16474 return false;
16475 } else if (M >= 0) {
16476 if (V1BroadcastIdx < 0)
16477 V1BroadcastIdx = M;
16478 else if (M != V1BroadcastIdx)
16479 return false;
16480 }
16481 return true;
16482 };
16483 if (DoBothBroadcast())
16484 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16485 DAG);
16486
16487 // If the inputs all stem from a single 128-bit lane of each input, then we
16488 // split them rather than blending because the split will decompose to
16489 // unusually few instructions.
16490 int LaneCount = VT.getSizeInBits() / 128;
16491 int LaneSize = Size / LaneCount;
16492 SmallBitVector LaneInputs[2];
16493 LaneInputs[0].resize(LaneCount, false);
16494 LaneInputs[1].resize(LaneCount, false);
16495 for (int i = 0; i < Size; ++i)
16496 if (Mask[i] >= 0)
16497 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
16498 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16499 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16500
16501 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16502 // requires that the decomposed single-input shuffles don't end up here.
16503 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16504 DAG);
16505}
16506
16507// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16508// TODO: Extend to support v8f32 (+ 512-bit shuffles).
16509static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16510 SDValue V1, SDValue V2,
16511 ArrayRef<int> Mask,
16512 SelectionDAG &DAG) {
16513 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
16514
16515 int LHSMask[4] = {-1, -1, -1, -1};
16516 int RHSMask[4] = {-1, -1, -1, -1};
16517 unsigned SHUFPMask = 0;
16518
16519 // As SHUFPD uses a single LHS/RHS element per lane, we can always
16520 // perform the shuffle once the lanes have been shuffled in place.
16521 for (int i = 0; i != 4; ++i) {
16522 int M = Mask[i];
16523 if (M < 0)
16524 continue;
16525 int LaneBase = i & ~1;
16526 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16527 LaneMask[LaneBase + (M & 1)] = M;
16528 SHUFPMask |= (M & 1) << i;
16529 }
16530
16531 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16532 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16533 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16534 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16535}
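// A minimal sketch (hypothetical v4f64 mask) of the SHUFPD immediate built
// above: bit i selects the odd (1) or even (0) element of whichever shuffled
// lane feeds output element i.
inline void sketchShufpdImmediate() {
  int Mask[4] = {1, 6, 3, 4}; // hypothetical two-input v4f64 shuffle mask
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;                       // 0 for lanes {0,1}, 2 for {2,3}
    int *LaneMask = (i & 1) ? RHSMask : LHSMask; // even outputs read LHS
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (unsigned)(M & 1) << i;
  }
  // LHSMask == {-1,1,-1,3}, RHSMask == {6,-1,4,-1}, SHUFPMask == 0b0101:
  // two lane shuffles build LHS/RHS, then SHUFPD selects per lane.
  (void)SHUFPMask;
}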
16536
16537/// Lower a vector shuffle crossing multiple 128-bit lanes as
16538/// a lane permutation followed by a per-lane permutation.
16539///
16540/// This is mainly for cases where we can have non-repeating permutes
16541/// in each lane.
16542///
16543/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
16544/// we should investigate merging them.
16545static SDValue lowerShuffleAsLanePermuteAndPermute(
16546 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16547 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16548 int NumElts = VT.getVectorNumElements();
16549 int NumLanes = VT.getSizeInBits() / 128;
16550 int NumEltsPerLane = NumElts / NumLanes;
16551 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
16552
16553 /// Attempts to find a sublane permute with the given size
16554 /// that gets all elements into their target lanes.
16555 ///
16556 /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffled result.
16557 /// If unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
16558 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
16559 int NumSublanesPerLane = NumSublanes / NumLanes;
16560 int NumEltsPerSublane = NumElts / NumSublanes;
16561
16562 SmallVector<int, 16> CrossLaneMask;
16563 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
16564 // CrossLaneMask but one entry == one sublane.
16565 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
16566
16567 for (int i = 0; i != NumElts; ++i) {
16568 int M = Mask[i];
16569 if (M < 0)
16570 continue;
16571
16572 int SrcSublane = M / NumEltsPerSublane;
16573 int DstLane = i / NumEltsPerLane;
16574
16575 // We only need to get the elements into the right lane, not sublane.
16576 // So search all sublanes that make up the destination lane.
16577 bool Found = false;
16578 int DstSubStart = DstLane * NumSublanesPerLane;
16579 int DstSubEnd = DstSubStart + NumSublanesPerLane;
16580 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
16581 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16582 continue;
16583
16584 Found = true;
16585 CrossLaneMaskLarge[DstSublane] = SrcSublane;
16586 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16587 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16588 break;
16589 }
16590 if (!Found)
16591 return SDValue();
16592 }
16593
16594 // Fill CrossLaneMask using CrossLaneMaskLarge.
16595 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
16596
16597 if (!CanUseSublanes) {
16598 // If we're only shuffling a single lowest lane and the rest are identity
16599 // then don't bother.
16600 // TODO - isShuffleMaskInputInPlace could be extended to something like
16601 // this.
16602 int NumIdentityLanes = 0;
16603 bool OnlyShuffleLowestLane = true;
16604 for (int i = 0; i != NumLanes; ++i) {
16605 int LaneOffset = i * NumEltsPerLane;
16606 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
16607 i * NumEltsPerLane))
16608 NumIdentityLanes++;
16609 else if (CrossLaneMask[LaneOffset] != 0)
16610 OnlyShuffleLowestLane = false;
16611 }
16612 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16613 return SDValue();
16614 }
16615
16616 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16617 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16618 InLaneMask);
16619 };
16620
16621 // First attempt a solution with full lanes.
16622 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16623 return V;
16624
16625 // The rest of the solutions use sublanes.
16626 if (!CanUseSublanes)
16627 return SDValue();
16628
16629 // Then attempt a solution with 64-bit sublanes (vpermq).
16630 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16631 return V;
16632
16633 // If that doesn't work and we have fast variable cross-lane shuffle,
16634 // attempt 32-bit sublanes (vpermd).
16635 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16636 return SDValue();
16637
16638 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16639}
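// A rough worked example (hypothetical v8i32 single-input mask) of the
// two-step decomposition above. For Mask == {6,7,4,5, 1,0,3,2}, destination
// lane 0 only wants source lane 1 and destination lane 1 only wants source
// lane 0, so CrossLaneMaskLarge == {1, 0}; the whole-lane permute produces
// {e4,e5,e6,e7, e0,e1,e2,e3}, and InLaneMask == {2,3,0,1, 5,4,7,6} then
// reorders within each lane so that the composition of the two shuffles
// reproduces the original mask.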
16640
16641/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16642/// source with a lane permutation.
16643///
16644/// This lowering strategy results in four instructions in the worst case for a
16645/// single-input cross lane shuffle which is lower than any other fully general
16646/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16647/// shuffle pattern should be handled prior to trying this lowering.
16648static SDValue lowerShuffleAsLanePermuteAndShuffle(
16649 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16650 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16651 // FIXME: This should probably be generalized for 512-bit vectors as well.
16652 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16653 int Size = Mask.size();
16654 int LaneSize = Size / 2;
16655
16656 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16657 // Only do this if the elements aren't all from the lower lane,
16658 // otherwise we're (probably) better off doing a split.
16659 if (VT == MVT::v4f64 &&
16660 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16661 if (SDValue V =
16662 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16663 return V;
16664
16665 // If there are only inputs from one 128-bit lane, splitting will in fact be
16666 // less expensive. The flags track whether the given lane contains an element
16667 // that crosses to another lane.
16668 bool AllLanes;
16669 if (!Subtarget.hasAVX2()) {
16670 bool LaneCrossing[2] = {false, false};
16671 for (int i = 0; i < Size; ++i)
16672 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16673 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16674 AllLanes = LaneCrossing[0] && LaneCrossing[1];
16675 } else {
16676 bool LaneUsed[2] = {false, false};
16677 for (int i = 0; i < Size; ++i)
16678 if (Mask[i] >= 0)
16679 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16680 AllLanes = LaneUsed[0] && LaneUsed[1];
16681 }
16682
16683 // TODO - we could support shuffling V2 in the Flipped input.
16684 assert(V2.isUndef() &&
16685 "This last part of this routine only works on single input shuffles");
16686
16687 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16688 for (int i = 0; i < Size; ++i) {
16689 int &M = InLaneMask[i];
16690 if (M < 0)
16691 continue;
16692 if (((M % Size) / LaneSize) != (i / LaneSize))
16693 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16694 }
16695 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16696 "In-lane shuffle mask expected");
16697
16698 // If we aren't using both lanes and the in-lane mask is not
16699 // repeating, then we're better off splitting.
16700 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
16701 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16702
16703 // Flip the lanes, and shuffle the results which should now be in-lane.
16704 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16705 SDValue Flipped = DAG.getBitcast(PVT, V1);
16706 Flipped =
16707 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16708 Flipped = DAG.getBitcast(VT, Flipped);
16709 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16710}
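// A small worked example (hypothetical indices, illustrating only the
// InLaneMask arithmetic above) for a 4-element, 2-lane mask {2, 0, 1, 3}:
// elements 0 and 2 cross lanes, so they are redirected to the lane-swapped
// "Flipped" operand (indices offset by Size == 4), giving
// InLaneMask == {4, 0, 7, 3}. Index 4 reads Flipped element 0, which holds
// original element 2, and index 7 reads Flipped element 3, which holds
// original element 1, so the final shuffle never crosses a lane.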
16711
16712/// Handle lowering 2-lane 128-bit shuffles.
16713static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16714 SDValue V2, ArrayRef<int> Mask,
16715 const APInt &Zeroable,
16716 const X86Subtarget &Subtarget,
16717 SelectionDAG &DAG) {
16718 if (V2.isUndef()) {
16719 // Attempt to match VBROADCAST*128 subvector broadcast load.
16720 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16721 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16722 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16723 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
16724 MVT MemVT = VT.getHalfNumVectorElementsVT();
16725 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16726 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16727 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
16728 VT, MemVT, Ld, Ofs, DAG))
16729 return BcstLd;
16730 }
16731
16732 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16733 if (Subtarget.hasAVX2())
16734 return SDValue();
16735 }
16736
16737 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16738
16739 SmallVector<int, 4> WidenedMask;
16740 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16741 return SDValue();
16742
16743 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16744 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16745
16746 // Try to use an insert into a zero vector.
16747 if (WidenedMask[0] == 0 && IsHighZero) {
16748 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16749 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16750 DAG.getIntPtrConstant(0, DL));
16751 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16752 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16753 DAG.getIntPtrConstant(0, DL));
16754 }
16755
16756 // TODO: If minimizing size and one of the inputs is a zero vector and the
16757 // zero vector has only one use, we could use a VPERM2X128 to save the
16758 // instruction bytes needed to explicitly generate the zero vector.
16759
16760 // Blends are faster and handle all the non-lane-crossing cases.
16761 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16762 Subtarget, DAG))
16763 return Blend;
16764
16765 // If either input operand is a zero vector, use VPERM2X128 because its mask
16766 // allows us to replace the zero input with an implicit zero.
16767 if (!IsLowZero && !IsHighZero) {
16768 // Check for patterns which can be matched with a single insert of a 128-bit
16769 // subvector.
16770 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16771 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16772
16773 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16774 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16775 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16776 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16777 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16778 OnlyUsesV1 ? V1 : V2,
16779 DAG.getIntPtrConstant(0, DL));
16780 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16781 DAG.getIntPtrConstant(2, DL));
16782 }
16783 }
16784
16785 // Try to use SHUF128 if possible.
16786 if (Subtarget.hasVLX()) {
16787 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16788 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16789 ((WidenedMask[1] % 2) << 1);
16790 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16791 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16792 }
16793 }
16794 }
16795
16796 // Otherwise form a 128-bit permutation. After accounting for undefs,
16797 // convert the 64-bit shuffle mask selection values into 128-bit
16798 // selection bits by dividing the indexes by 2 and shifting into positions
16799 // defined by a vperm2*128 instruction's immediate control byte.
16800
16801 // The immediate permute control byte looks like this:
16802 // [1:0] - select 128 bits from sources for low half of destination
16803 // [2] - ignore
16804 // [3] - zero low half of destination
16805 // [5:4] - select 128 bits from sources for high half of destination
16806 // [6] - ignore
16807 // [7] - zero high half of destination
16808
16809 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16810        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16811
16812 unsigned PermMask = 0;
16813 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16814 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16815
16816 // Check the immediate mask and replace unused sources with undef.
16817 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16818 V1 = DAG.getUNDEF(VT);
16819 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16820 V2 = DAG.getUNDEF(VT);
16821
16822 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16823 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16824}
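For reference, a small standalone sketch of the VPERM2X128 immediate construction at lines 16812-16814, following the control-byte layout documented above (the helper name and example values are illustrative, not from the LLVM sources):

#include <cassert>
#include <cstdio>

// WidenedMask entries select one of four 128-bit halves (0/1 from V1,
// 2/3 from V2); the zero flags set immediate bits 3 and 7.
unsigned vperm2x128Imm(int WidenedMask0, int WidenedMask1,
                       bool IsLowZero, bool IsHighZero) {
  assert((WidenedMask0 >= 0 || IsLowZero) &&
         (WidenedMask1 >= 0 || IsHighZero) && "Undef half?");
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (unsigned)(WidenedMask0 << 0);
  PermMask |= IsHighZero ? 0x80 : (unsigned)(WidenedMask1 << 4);
  return PermMask;
}

int main() {
  // <2, 3, 4, 5>: low half of the result = high half of V1 (1),
  // high half of the result = low half of V2 (2) -> imm 0x21.
  std::printf("0x%02x\n", vperm2x128Imm(1, 2, false, false));
  // Low half zeroed, high half = low half of V1 -> imm 0x08.
  std::printf("0x%02x\n", vperm2x128Imm(-1, 0, true, false));
  return 0;
}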
16825
16826/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16827/// shuffling each lane.
16828///
16829/// This attempts to create a repeated lane shuffle where each lane uses one
16830/// or two of the lanes of the inputs. The lanes of the input vectors are
16831/// shuffled in one or two independent shuffles to get the lanes into the
16832/// position needed by the final shuffle.
16833static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16834 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16835 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16836 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16837
16838 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16839 return SDValue();
16840
16841 int NumElts = Mask.size();
16842 int NumLanes = VT.getSizeInBits() / 128;
16843 int NumLaneElts = 128 / VT.getScalarSizeInBits();
16844 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16845 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16846
16847 // First pass will try to fill in the RepeatMask from lanes that need two
16848 // sources.
16849 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16850 int Srcs[2] = {-1, -1};
16851 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16852 for (int i = 0; i != NumLaneElts; ++i) {
16853 int M = Mask[(Lane * NumLaneElts) + i];
16854 if (M < 0)
16855 continue;
16856 // Determine which of the possible input lanes (NumLanes from each source)
16857 // this element comes from. Assign that as one of the sources for this
16858 // lane. We can assign up to 2 sources for this lane. If we run out of
16859 // sources we can't do anything.
16860 int LaneSrc = M / NumLaneElts;
16861 int Src;
16862 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16863 Src = 0;
16864 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16865 Src = 1;
16866 else
16867 return SDValue();
16868
16869 Srcs[Src] = LaneSrc;
16870 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16871 }
16872
16873 // If this lane has two sources, see if it fits with the repeat mask so far.
16874 if (Srcs[1] < 0)
16875 continue;
16876
16877 LaneSrcs[Lane][0] = Srcs[0];
16878 LaneSrcs[Lane][1] = Srcs[1];
16879
16880 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16881 assert(M1.size() == M2.size() && "Unexpected mask size");
16882 for (int i = 0, e = M1.size(); i != e; ++i)
16883 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16884 return false;
16885 return true;
16886 };
16887
16888 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16889 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16890 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16891 int M = Mask[i];
16892 if (M < 0)
16893 continue;
16894 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16895        "Unexpected mask element");
16896 MergedMask[i] = M;
16897 }
16898 };
16899
16900 if (MatchMasks(InLaneMask, RepeatMask)) {
16901 // Merge this lane mask into the final repeat mask.
16902 MergeMasks(InLaneMask, RepeatMask);
16903 continue;
16904 }
16905
16906 // Didn't find a match. Swap the operands and try again.
16907 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16908 ShuffleVectorSDNode::commuteMask(InLaneMask);
16909
16910 if (MatchMasks(InLaneMask, RepeatMask)) {
16911 // Merge this lane mask into the final repeat mask.
16912 MergeMasks(InLaneMask, RepeatMask);
16913 continue;
16914 }
16915
16916 // Couldn't find a match with the operands in either order.
16917 return SDValue();
16918 }
16919
16920 // Now handle any lanes with only one source.
16921 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16922 // If this lane has already been processed, skip it.
16923 if (LaneSrcs[Lane][0] >= 0)
16924 continue;
16925
16926 for (int i = 0; i != NumLaneElts; ++i) {
16927 int M = Mask[(Lane * NumLaneElts) + i];
16928 if (M < 0)
16929 continue;
16930
16931 // If RepeatMask isn't defined yet we can define it ourselves.
16932 if (RepeatMask[i] < 0)
16933 RepeatMask[i] = M % NumLaneElts;
16934
16935 if (RepeatMask[i] < NumElts) {
16936 if (RepeatMask[i] != M % NumLaneElts)
16937 return SDValue();
16938 LaneSrcs[Lane][0] = M / NumLaneElts;
16939 } else {
16940 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16941 return SDValue();
16942 LaneSrcs[Lane][1] = M / NumLaneElts;
16943 }
16944 }
16945
16946 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16947 return SDValue();
16948 }
16949
16950 SmallVector<int, 16> NewMask(NumElts, -1);
16951 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16952 int Src = LaneSrcs[Lane][0];
16953 for (int i = 0; i != NumLaneElts; ++i) {
16954 int M = -1;
16955 if (Src >= 0)
16956 M = Src * NumLaneElts + i;
16957 NewMask[Lane * NumLaneElts + i] = M;
16958 }
16959 }
16960 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16961 // Ensure we didn't get back the shuffle we started with.
16962 // FIXME: This is a hack to make up for some splat handling code in
16963 // getVectorShuffle.
16964 if (isa<ShuffleVectorSDNode>(NewV1) &&
16965 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16966 return SDValue();
16967
16968 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16969 int Src = LaneSrcs[Lane][1];
16970 for (int i = 0; i != NumLaneElts; ++i) {
16971 int M = -1;
16972 if (Src >= 0)
16973 M = Src * NumLaneElts + i;
16974 NewMask[Lane * NumLaneElts + i] = M;
16975 }
16976 }
16977 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16978 // Ensure we didn't get back the shuffle we started with.
16979 // FIXME: This is a hack to make up for some splat handling code in
16980 // getVectorShuffle.
16981 if (isa<ShuffleVectorSDNode>(NewV2) &&
16982 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16983 return SDValue();
16984
16985 for (int i = 0; i != NumElts; ++i) {
16986 NewMask[i] = RepeatMask[i % NumLaneElts];
16987 if (NewMask[i] < 0)
16988 continue;
16989
16990 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16991 }
16992 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16993}
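A minimal standalone sketch of the first-pass source assignment above (lines 16849-16871): for one 128-bit destination lane, at most two source lanes are recorded and the mask is rewritten relative to those sources (hypothetical values, not the LLVM helper itself).

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8, NumLaneElts = 4;   // v8f32: two 128-bit lanes
  // Destination lane 0 pulls from the high lane of V1 (elts 4-7) and the
  // high lane of V2 (elts 12-15); destination lane 1 is left undef here.
  std::vector<int> Mask = {4, 12, 5, 13, -1, -1, -1, -1};
  int Srcs[2] = {-1, -1};
  std::vector<int> InLaneMask(NumLaneElts, -1);
  for (int i = 0; i != NumLaneElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneSrc = M / NumLaneElts;          // which of the 4 source lanes M is in
    int Src;
    if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
      Src = 0;
    else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
      Src = 1;
    else {
      std::puts("more than two source lanes - bail");
      return 0;
    }
    Srcs[Src] = LaneSrc;
    InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
  }
  std::printf("Srcs = {%d, %d}\n", Srcs[0], Srcs[1]);  // {1, 3}: V1 high, V2 high
  for (int M : InLaneMask)
    std::printf("%d ", M);                  // 0 8 1 9 - an unpcklps-style repeat mask
  std::printf("\n");
  return 0;
}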
16994
16995/// If the input shuffle mask results in a vector that is undefined in all upper
16996/// or lower half elements and that mask accesses only 2 halves of the
16997/// shuffle's operands, return true. A mask of half the width with mask indexes
16998/// adjusted to access the extracted halves of the original shuffle operands is
16999/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17000/// lower half of each input operand is accessed.
17001static bool
17002getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17003 int &HalfIdx1, int &HalfIdx2) {
17004 assert((Mask.size() == HalfMask.size() * 2) &&
17005        "Expected input mask to be twice as long as output");
17006
17007 // Exactly one half of the result must be undef to allow narrowing.
17008 bool UndefLower = isUndefLowerHalf(Mask);
17009 bool UndefUpper = isUndefUpperHalf(Mask);
17010 if (UndefLower == UndefUpper)
17011 return false;
17012
17013 unsigned HalfNumElts = HalfMask.size();
17014 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17015 HalfIdx1 = -1;
17016 HalfIdx2 = -1;
17017 for (unsigned i = 0; i != HalfNumElts; ++i) {
17018 int M = Mask[i + MaskIndexOffset];
17019 if (M < 0) {
17020 HalfMask[i] = M;
17021 continue;
17022 }
17023
17024 // Determine which of the 4 half vectors this element is from.
17025 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17026 int HalfIdx = M / HalfNumElts;
17027
17028 // Determine the element index into its half vector source.
17029 int HalfElt = M % HalfNumElts;
17030
17031 // We can shuffle with up to 2 half vectors, set the new 'half'
17032 // shuffle mask accordingly.
17033 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17034 HalfMask[i] = HalfElt;
17035 HalfIdx1 = HalfIdx;
17036 continue;
17037 }
17038 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17039 HalfMask[i] = HalfElt + HalfNumElts;
17040 HalfIdx2 = HalfIdx;
17041 continue;
17042 }
17043
17044 // Too many half vectors referenced.
17045 return false;
17046 }
17047
17048 return true;
17049}
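A standalone sketch of the index arithmetic in getHalfShuffleMask, assuming a v8f32-style mask with an undef upper half (the mask and values are example data only):

#include <cstdio>
#include <vector>

int main() {
  const unsigned HalfNumElts = 4;             // v8f32 -> 4-element halves
  // Upper half of the result is undef; lower half reads lower V1 and upper V2.
  std::vector<int> Mask = {0, 2, 13, 15, -1, -1, -1, -1};
  int HalfIdx1 = -1, HalfIdx2 = -1;
  std::vector<int> HalfMask(HalfNumElts, -1);
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts;            // 0 = lower V1, 1 = upper V1,
    int HalfElt = M % HalfNumElts;            // 2 = lower V2, 3 = upper V2
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    } else {
      std::puts("more than two halves referenced - not narrowable");
      return 0;
    }
  }
  // HalfIdx1 = 0 (lower V1), HalfIdx2 = 3 (upper V2), HalfMask = {0, 2, 5, 7}.
  std::printf("HalfIdx1=%d HalfIdx2=%d\n", HalfIdx1, HalfIdx2);
  for (int M : HalfMask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}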
17050
17051/// Given the output values from getHalfShuffleMask(), create a half width
17052/// shuffle of extracted vectors followed by an insert back to full width.
17053static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17054 ArrayRef<int> HalfMask, int HalfIdx1,
17055 int HalfIdx2, bool UndefLower,
17056 SelectionDAG &DAG, bool UseConcat = false) {
17057 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17058 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17059
17060 MVT VT = V1.getSimpleValueType();
17061 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17062 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17063
17064 auto getHalfVector = [&](int HalfIdx) {
17065 if (HalfIdx < 0)
17066 return DAG.getUNDEF(HalfVT);
17067 SDValue V = (HalfIdx < 2 ? V1 : V2);
17068 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17069 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17070 DAG.getIntPtrConstant(HalfIdx, DL));
17071 };
17072
17073 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17074 SDValue Half1 = getHalfVector(HalfIdx1);
17075 SDValue Half2 = getHalfVector(HalfIdx2);
17076 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17077 if (UseConcat) {
17078 SDValue Op0 = V;
17079 SDValue Op1 = DAG.getUNDEF(HalfVT);
17080 if (UndefLower)
17081 std::swap(Op0, Op1);
17082 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17083 }
17084
17085 unsigned Offset = UndefLower ? HalfNumElts : 0;
17086 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17087 DAG.getIntPtrConstant(Offset, DL));
17088}
17089
17090/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17091/// This allows for fast cases such as subvector extraction/insertion
17092/// or shuffling smaller vector types which can lower more efficiently.
17093static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17094 SDValue V2, ArrayRef<int> Mask,
17095 const X86Subtarget &Subtarget,
17096 SelectionDAG &DAG) {
17097 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17098        "Expected 256-bit or 512-bit vector");
17099
17100 bool UndefLower = isUndefLowerHalf(Mask);
17101 if (!UndefLower && !isUndefUpperHalf(Mask))
17102 return SDValue();
17103
17104 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17105        "Completely undef shuffle mask should have been simplified already");
17106
17107 // Upper half is undef and lower half is whole upper subvector.
17108 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17109 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17110 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17111 if (!UndefLower &&
17112 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17113 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17114 DAG.getIntPtrConstant(HalfNumElts, DL));
17115 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17116 DAG.getIntPtrConstant(0, DL));
17117 }
17118
17119 // Lower half is undef and upper half is whole lower subvector.
17120 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17121 if (UndefLower &&
17122 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17123 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17124 DAG.getIntPtrConstant(0, DL));
17125 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17126 DAG.getIntPtrConstant(HalfNumElts, DL));
17127 }
17128
17129 int HalfIdx1, HalfIdx2;
17130 SmallVector<int, 8> HalfMask(HalfNumElts);
17131 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17132 return SDValue();
17133
17134 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17135
17136 // Only shuffle the halves of the inputs when useful.
17137 unsigned NumLowerHalves =
17138 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17139 unsigned NumUpperHalves =
17140 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17141 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17142
17143 // Determine the larger pattern of undef/halves, then decide if it's worth
17144 // splitting the shuffle based on subtarget capabilities and types.
17145 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17146 if (!UndefLower) {
17147 // XXXXuuuu: no insert is needed.
17148 // Always extract lowers when setting lower - these are all free subreg ops.
17149 if (NumUpperHalves == 0)
17150 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17151 UndefLower, DAG);
17152
17153 if (NumUpperHalves == 1) {
17154 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17155 if (Subtarget.hasAVX2()) {
17156 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17157 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17158 !is128BitUnpackShuffleMask(HalfMask) &&
17159 (!isSingleSHUFPSMask(HalfMask) ||
17160 Subtarget.hasFastVariableCrossLaneShuffle()))
17161 return SDValue();
17162 // If this is a unary shuffle (assume that the 2nd operand is
17163 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17164 // are better off extracting the upper half of 1 operand and using a
17165 // narrow shuffle.
17166 if (EltWidth == 64 && V2.isUndef())
17167 return SDValue();
17168 }
17169 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17170 if (Subtarget.hasAVX512() && VT.is512BitVector())
17171 return SDValue();
17172 // Extract + narrow shuffle is better than the wide alternative.
17173 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17174 UndefLower, DAG);
17175 }
17176
17177 // Don't extract both uppers, instead shuffle and then extract.
17178 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17179 return SDValue();
17180 }
17181
17182 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17183 if (NumUpperHalves == 0) {
17184 // AVX2 has efficient 64-bit element cross-lane shuffles.
17185 // TODO: Refine to account for unary shuffle, splat, and other masks?
17186 if (Subtarget.hasAVX2() && EltWidth == 64)
17187 return SDValue();
17188 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17189 if (Subtarget.hasAVX512() && VT.is512BitVector())
17190 return SDValue();
17191 // Narrow shuffle + insert is better than the wide alternative.
17192 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17193 UndefLower, DAG);
17194 }
17195
17196 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17197 return SDValue();
17198}
17199
17200/// Test whether the specified input (0 or 1) is in-place blended by the
17201/// given mask.
17202///
17203/// This returns true if the elements from a particular input are already in the
17204/// slot required by the given mask and require no permutation.
17205static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
17206 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
17207 int Size = Mask.size();
17208 for (int i = 0; i < Size; ++i)
17209 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
17210 return false;
17211
17212 return true;
17213}
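A tiny standalone restatement of the in-place test above with example masks (the free-function name is hypothetical): input 0 or 1 is "in place" when every mask element that refers to it already sits at its own index modulo the vector width.

#include <cstdio>
#include <vector>

static bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  // v4i64 mask <0, 5, 2, 7>: V1 elements stay in slots 0 and 2,
  // V2 elements also land on their own slots 1 and 3 - both in place.
  std::vector<int> Mask = {0, 5, 2, 7};
  std::printf("V1 in place: %d\n", inputInPlace(0, Mask)); // 1
  std::printf("V2 in place: %d\n", inputInPlace(1, Mask)); // 1
  // <1, 5, 2, 7>: V1 element 1 is requested in slot 0 -> V1 not in place.
  Mask[0] = 1;
  std::printf("V1 in place: %d\n", inputInPlace(0, Mask)); // 0
  return 0;
}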
17214
17215/// Handle case where shuffle sources are coming from the same 128-bit lane and
17216/// every lane can be represented as the same repeating mask - allowing us to
17217/// shuffle the sources with the repeating shuffle and then permute the result
17218/// to the destination lanes.
17219static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17220 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17221 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17222 int NumElts = VT.getVectorNumElements();
17223 int NumLanes = VT.getSizeInBits() / 128;
17224 int NumLaneElts = NumElts / NumLanes;
17225
17226 // On AVX2 we may be able to just shuffle the lowest elements and then
17227 // broadcast the result.
17228 if (Subtarget.hasAVX2()) {
17229 for (unsigned BroadcastSize : {16, 32, 64}) {
17230 if (BroadcastSize <= VT.getScalarSizeInBits())
17231 continue;
17232 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17233
17234 // Attempt to match a repeating pattern every NumBroadcastElts,
17235 // accounting for UNDEFs, but only referencing the lowest 128-bit
17236 // lane of the inputs.
17237 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17238 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17239 for (int j = 0; j != NumBroadcastElts; ++j) {
17240 int M = Mask[i + j];
17241 if (M < 0)
17242 continue;
17243 int &R = RepeatMask[j];
17244 if (0 != ((M % NumElts) / NumLaneElts))
17245 return false;
17246 if (0 <= R && R != M)
17247 return false;
17248 R = M;
17249 }
17250 return true;
17251 };
17252
17253 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17254 if (!FindRepeatingBroadcastMask(RepeatMask))
17255 continue;
17256
17257 // Shuffle the (lowest) repeated elements in place for broadcast.
17258 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17259
17260 // Shuffle the actual broadcast.
17261 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17262 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17263 for (int j = 0; j != NumBroadcastElts; ++j)
17264 BroadcastMask[i + j] = j;
17265 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17266 BroadcastMask);
17267 }
17268 }
17269
17270 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17271 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17272 return SDValue();
17273
17274 // Bail if we already have a repeated lane shuffle mask.
17275 SmallVector<int, 8> RepeatedShuffleMask;
17276 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
17277 return SDValue();
17278
17279 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17280 // (with PERMQ/PERMPD). On AVX512BW targets, permuting 32-bit sub-lanes, even
17281 // with a variable shuffle, is worth it for 64xi8 vectors. Otherwise we can
17282 // only permute whole 128-bit lanes.
17283 int SubLaneScale = 1;
17284 if (Subtarget.hasAVX2() && VT.is256BitVector())
17285 SubLaneScale = 2;
17286 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17287 SubLaneScale = 4;
17288 int NumSubLanes = NumLanes * SubLaneScale;
17289 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17290
17291 // Check that all the sources are coming from the same lane and see if we can
17292 // form a repeating shuffle mask (local to each sub-lane). At the same time,
17293 // determine the source sub-lane for each destination sub-lane.
17294 int TopSrcSubLane = -1;
17295 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17296 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17297 SubLaneScale,
17298 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17299
17300 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17301 // Extract the sub-lane mask, check that it all comes from the same lane
17302 // and normalize the mask entries to come from the first lane.
17303 int SrcLane = -1;
17304 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17305 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17306 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17307 if (M < 0)
17308 continue;
17309 int Lane = (M % NumElts) / NumLaneElts;
17310 if ((0 <= SrcLane) && (SrcLane != Lane))
17311 return SDValue();
17312 SrcLane = Lane;
17313 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17314 SubLaneMask[Elt] = LocalM;
17315 }
17316
17317 // Whole sub-lane is UNDEF.
17318 if (SrcLane < 0)
17319 continue;
17320
17321 // Attempt to match against the candidate repeated sub-lane masks.
17322 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17323 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17324 for (int i = 0; i != NumSubLaneElts; ++i) {
17325 if (M1[i] < 0 || M2[i] < 0)
17326 continue;
17327 if (M1[i] != M2[i])
17328 return false;
17329 }
17330 return true;
17331 };
17332
17333 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17334 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17335 continue;
17336
17337 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17338 for (int i = 0; i != NumSubLaneElts; ++i) {
17339 int M = SubLaneMask[i];
17340 if (M < 0)
17341 continue;
17342 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17343        "Unexpected mask element");
17344 RepeatedSubLaneMask[i] = M;
17345 }
17346
17347 // Track the topmost source sub-lane - by setting the remaining to UNDEF
17348 // we can greatly simplify shuffle matching.
17349 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17350 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17351 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17352 break;
17353 }
17354
17355 // Bail if we failed to find a matching repeated sub-lane mask.
17356 if (Dst2SrcSubLanes[DstSubLane] < 0)
17357 return SDValue();
17358 }
17359 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17360        "Unexpected source lane");
17361
17362 // Create a repeating shuffle mask for the entire vector.
17363 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17364 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17365 int Lane = SubLane / SubLaneScale;
17366 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17367 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17368 int M = RepeatedSubLaneMask[Elt];
17369 if (M < 0)
17370 continue;
17371 int Idx = (SubLane * NumSubLaneElts) + Elt;
17372 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17373 }
17374 }
17375 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17376
17377 // Shuffle each source sub-lane to its destination.
17378 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17379 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17380 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17381 if (SrcSubLane < 0)
17382 continue;
17383 for (int j = 0; j != NumSubLaneElts; ++j)
17384 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17385 }
17386
17387 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17388 SubLaneMask);
17389}
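A standalone sketch of the FindRepeatingBroadcastMask check at the top of this function (lines 17237-17251), assuming a v8f32 mask and a 64-bit broadcast width (example values only): the mask must repeat every NumBroadcastElts elements and reference only the lowest 128-bit lane.

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8, NumLaneElts = 4;  // v8f32
  const int NumBroadcastElts = 2;          // 64-bit broadcast = pairs of f32
  // Pairs {1, 2} repeat across the vector and reference only the low lane.
  std::vector<int> Mask = {1, 2, 1, 2, 1, 2, -1, 2};
  std::vector<int> RepeatMask(NumElts, -1);
  bool OK = true;
  for (int i = 0; i != NumElts && OK; i += NumBroadcastElts)
    for (int j = 0; j != NumBroadcastElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      int &R = RepeatMask[j];
      if (((M % NumElts) / NumLaneElts) != 0) {  // must come from the low lane
        OK = false;
        break;
      }
      if (R >= 0 && R != M) {                    // must repeat every pair
        OK = false;
        break;
      }
      R = M;
    }
  // With OK == true the lowering shuffles {1, 2} into the low slots and then
  // broadcasts them with BroadcastMask = {0, 1, 0, 1, 0, 1, 0, 1}.
  std::printf("repeating low-lane broadcast pattern: %s\n", OK ? "yes" : "no");
  return 0;
}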
17390
17391static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17392 bool &ForceV1Zero, bool &ForceV2Zero,
17393 unsigned &ShuffleImm, ArrayRef<int> Mask,
17394 const APInt &Zeroable) {
17395 int NumElts = VT.getVectorNumElements();
17396 assert(VT.getScalarSizeInBits() == 64 &&
17397        (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17398        "Unexpected data type for VSHUFPD");
17399 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17400        "Illegal shuffle mask");
17401
17402 bool ZeroLane[2] = { true, true };
17403 for (int i = 0; i < NumElts; ++i)
17404 ZeroLane[i & 1] &= Zeroable[i];
17405
17406 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
17407 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
17408 ShuffleImm = 0;
17409 bool ShufpdMask = true;
17410 bool CommutableMask = true;
17411 for (int i = 0; i < NumElts; ++i) {
17412 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17413 continue;
17414 if (Mask[i] < 0)
17415 return false;
17416 int Val = (i & 6) + NumElts * (i & 1);
17417 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17418 if (Mask[i] < Val || Mask[i] > Val + 1)
17419 ShufpdMask = false;
17420 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17421 CommutableMask = false;
17422 ShuffleImm |= (Mask[i] % 2) << i;
17423 }
17424
17425 if (!ShufpdMask && !CommutableMask)
17426 return false;
17427
17428 if (!ShufpdMask && CommutableMask)
17429 std::swap(V1, V2);
17430
17431 ForceV1Zero = ZeroLane[0];
17432 ForceV2Zero = ZeroLane[1];
17433 return true;
17434}
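A standalone sketch of the SHUFPD immediate computation above, ignoring the zeroable and commute handling (example mask only, not the LLVM code itself): result element i reads lane (i & 6) of V1 when i is even and of V2 when i is odd, and immediate bit i picks the low or high double of that lane.

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;                   // v4f64
  std::vector<int> Mask = {0, 5, 2, 7};    // classic VSHUFPD pattern
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int Val = (i & 6) + NumElts * (i & 1); // expected base index for slot i
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    ShuffleImm |= (unsigned)(Mask[i] % 2) << i;
  }
  // Mask <0, 5, 2, 7> fits: immediate 0b1010 = 0xa.
  std::printf("matches SHUFPD: %d, imm = 0x%x\n", ShufpdMask, ShuffleImm);
  return 0;
}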
17435
17436static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17437 SDValue V2, ArrayRef<int> Mask,
17438 const APInt &Zeroable,
17439 const X86Subtarget &Subtarget,
17440 SelectionDAG &DAG) {
17441 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17442        "Unexpected data type for VSHUFPD");
17443
17444 unsigned Immediate = 0;
17445 bool ForceV1Zero = false, ForceV2Zero = false;
17446 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17447 Mask, Zeroable))
17448 return SDValue();
17449
17450 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17451 if (ForceV1Zero)
17452 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17453 if (ForceV2Zero)
17454 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17455
17456 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17457 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17458}
17459
17460 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17461// by zeroable elements in the remaining 24 elements. Turn this into two
17462// vmovqb instructions shuffled together.
17463static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17464 SDValue V1, SDValue V2,
17465 ArrayRef<int> Mask,
17466 const APInt &Zeroable,
17467 SelectionDAG &DAG) {
17468 assert(VT == MVT::v32i8 && "Unexpected type!");
17469
17470 // The first 8 indices should be every 8th element.
17471 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17472 return SDValue();
17473
17474 // Remaining elements need to be zeroable.
17475 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17476 return SDValue();
17477
17478 V1 = DAG.getBitcast(MVT::v4i64, V1);
17479 V2 = DAG.getBitcast(MVT::v4i64, V2);
17480
17481 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17482 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17483
17484 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17485 // the upper bits of the result using an unpckldq.
17486 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17487 { 0, 1, 2, 3, 16, 17, 18, 19,
17488 4, 5, 6, 7, 20, 21, 22, 23 });
17489 // Insert the unpckldq into a zero vector to widen to v32i8.
17490 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17491 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17492 DAG.getIntPtrConstant(0, DL));
17493}
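A standalone sketch of the two conditions this lowering checks, assuming a fully zeroable upper 24 elements (the loops are simplified stand-ins for isSequentialOrUndefInRange and the Zeroable APInt):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask(32, -1);
  for (int i = 0; i != 8; ++i)
    Mask[i] = i * 8;                       // 0, 8, 16, 24, 32, 40, 48, 56
  std::vector<bool> Zeroable(32, true);    // stand-in for the Zeroable APInt

  // The first 8 indices must step by 8 (undef entries are tolerated).
  bool StridedLow8 = true;
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * 8)
      StridedLow8 = false;

  // Every remaining element must be zeroable.
  bool UpperZeroable = true;
  for (int i = 8; i != 32; ++i)
    if (!Zeroable[i])
      UpperZeroable = false;

  // When both hold, the shuffle becomes two VTRUNCs (v4i64 -> v16i8) whose low
  // quarters are interleaved with the unpckldq-style mask
  // {0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23} and widened with zeros.
  std::printf("strided: %d, upper zeroable: %d\n", StridedLow8, UpperZeroable);
  return 0;
}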
17494
17495
17496/// Handle lowering of 4-lane 64-bit floating point shuffles.
17497///
17498/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17499/// isn't available.
17500static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17501 const APInt &Zeroable, SDValue V1, SDValue V2,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17505 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17506 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17507
17508 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17509 Subtarget, DAG))
17510 return V;
17511
17512 if (V2.isUndef()) {
17513 // Check for being able to broadcast a single element.
17514 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17515 Mask, Subtarget, DAG))
17516 return Broadcast;
17517
17518 // Use low duplicate instructions for masks that match their pattern.
17519 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17520 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17521
17522 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17523 // Non-half-crossing single input shuffles can be lowered with an
17524 // interleaved permutation.
17525 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17526 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17527 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17528 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17529 }
17530
17531 // With AVX2 we have direct support for this permutation.
17532 if (Subtarget.hasAVX2())
17533 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17534 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17535
17536 // Try to create an in-lane repeating shuffle mask and then shuffle the
17537 // results into the target lanes.
17538 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17539 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17540 return V;
17541
17542 // Try to permute the lanes and then use a per-lane permute.
17543 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17544 Mask, DAG, Subtarget))
17545 return V;
17546
17547 // Otherwise, fall back.
17548 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17549 DAG, Subtarget);
17550 }
17551
17552 // Use dedicated unpack instructions for masks that match their pattern.
17553 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
17554 return V;
17555
17556 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17557 Zeroable, Subtarget, DAG))
17558 return Blend;
17559
17560 // Check if the blend happens to exactly fit that of SHUFPD.
17561 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17562 Zeroable, Subtarget, DAG))
17563 return Op;
17564
17565 // If we have lane crossing shuffles AND they don't all come from the lower
17566 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17567 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
17568 // canonicalizes to a blend of splat, which isn't necessary for this combine.
17569 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17570 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17571 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17572 (V2.getOpcode() != ISD::BUILD_VECTOR))
17573 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
17574 Mask, DAG))
17575 return Op;
17576
17577 // If we have one input in place, then we can permute the other input and
17578 // blend the result.
17579 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17580 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17581 Subtarget, DAG);
17582
17583 // Try to create an in-lane repeating shuffle mask and then shuffle the
17584 // results into the target lanes.
17585 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17586 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17587 return V;
17588
17589 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17590 // shuffle. However, if we have AVX2 and either input is already in place,
17591 // we will be able to shuffle the other input even across lanes in a single
17592 // instruction, so skip this pattern.
17593 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
17594 isShuffleMaskInputInPlace(1, Mask))))
17595 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
17596 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17597 return V;
17598
17599 // If we have VLX support, we can use VEXPAND.
17600 if (Subtarget.hasVLX())
17601 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
17602 DAG, Subtarget))
17603 return V;
17604
17605 // If we have AVX2 then we always want to lower with a blend because at v4 we
17606 // can fully permute the elements.
17607 if (Subtarget.hasAVX2())
17608 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17609 Subtarget, DAG);
17610
17611 // Otherwise fall back on generic lowering.
17612 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
17613 Subtarget, DAG);
17614}
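A standalone sketch of the VPERMILPI immediate built for non-lane-crossing unary v4f64 shuffles above (lines 17525-17526), with an illustrative in-lane mask: bit i selects the high double of element i's 128-bit lane.

#include <cstdio>

int main() {
  // In-lane mask <1, 0, 3, 2>: swap the doubles within each 128-bit lane.
  int Mask[4] = {1, 0, 3, 2};
  unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                          ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
  std::printf("vpermilpd imm = 0x%x\n", VPERMILPMask); // 0x5
  return 0;
}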
17615
17616/// Handle lowering of 4-lane 64-bit integer shuffles.
17617///
17618/// This routine is only called when we have AVX2 and thus a reasonable
17619 /// instruction set for v4i64 shuffling.
17620static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17621 const APInt &Zeroable, SDValue V1, SDValue V2,
17622 const X86Subtarget &Subtarget,
17623 SelectionDAG &DAG) {
17624 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17625 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17626 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17627 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17628
17629 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17630 Subtarget, DAG))
17631 return V;
17632
17633 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17634 Zeroable, Subtarget, DAG))
17635 return Blend;
17636
17637 // Check for being able to broadcast a single element.
17638 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17639 Subtarget, DAG))
17640 return Broadcast;
17641
17642 if (V2.isUndef()) {
17643 // When the shuffle is mirrored between the 128-bit lanes of the input, we
17644 // can use lower-latency instructions that will operate on both lanes.
17645 SmallVector<int, 2> RepeatedMask;
17646 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17647 SmallVector<int, 4> PSHUFDMask;
17648 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17649 return DAG.getBitcast(
17650 MVT::v4i64,
17651 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17652 DAG.getBitcast(MVT::v8i32, V1),
17653 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17654 }
17655
17656 // AVX2 provides a direct instruction for permuting a single input across
17657 // lanes.
17658 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17659 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17660 }
17661
17662 // Try to use shift instructions.
17663 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17664 Zeroable, Subtarget, DAG))
17665 return Shift;
17666
17667 // If we have VLX support, we can use VALIGN or VEXPAND.
17668 if (Subtarget.hasVLX()) {
17669 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17670 Subtarget, DAG))
17671 return Rotate;
17672
17673 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17674 DAG, Subtarget))
17675 return V;
17676 }
17677
17678 // Try to use PALIGNR.
17679 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17680 Subtarget, DAG))
17681 return Rotate;
17682
17683 // Use dedicated unpack instructions for masks that match their pattern.
17684 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17685 return V;
17686
17687 // If we have one input in place, then we can permute the other input and
17688 // blend the result.
17689 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17690 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17691 Subtarget, DAG);
17692
17693 // Try to create an in-lane repeating shuffle mask and then shuffle the
17694 // results into the target lanes.
17695 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17696 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17697 return V;
17698
17699 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17700 // shuffle. However, if we have AVX2 and either input is already in place,
17701 // we will be able to shuffle the other input even across lanes in a single
17702 // instruction, so skip this pattern.
17703 if (!isShuffleMaskInputInPlace(0, Mask) &&
17704 !isShuffleMaskInputInPlace(1, Mask))
17705 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17706 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17707 return Result;
17708
17709 // Otherwise fall back on generic blend lowering.
17710 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17711 Subtarget, DAG);
17712}
17713
17714/// Handle lowering of 8-lane 32-bit floating point shuffles.
17715///
17716/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17717/// isn't available.
17718static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17719 const APInt &Zeroable, SDValue V1, SDValue V2,
17720 const X86Subtarget &Subtarget,
17721 SelectionDAG &DAG) {
17722 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17723 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17724 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17725
17726 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17727 Zeroable, Subtarget, DAG))
17728 return Blend;
17729
17730 // Check for being able to broadcast a single element.
17731 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17732 Subtarget, DAG))
17733 return Broadcast;
17734
17735 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17736 // options to efficiently lower the shuffle.
17737 SmallVector<int, 4> RepeatedMask;
17738 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17739 assert(RepeatedMask.size() == 4 &&
17740        "Repeated masks must be half the mask width!");
17741
17742 // Use even/odd duplicate instructions for masks that match their pattern.
17743 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17744 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17745 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17746 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17747
17748 if (V2.isUndef())
17749 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17750 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17751
17752 // Use dedicated unpack instructions for masks that match their pattern.
17753 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17754 return V;
17755
17756 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17757 // have already handled any direct blends.
17758 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17759 }
17760
17761 // Try to create an in-lane repeating shuffle mask and then shuffle the
17762 // results into the target lanes.
17763 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17764 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17765 return V;
17766
17767 // If we have a single input shuffle with different shuffle patterns in the
17768 // two 128-bit lanes use the variable mask to VPERMILPS.
17769 if (V2.isUndef()) {
17770 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17771 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17772 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17773 }
17774 if (Subtarget.hasAVX2()) {
17775 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17776 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17777 }
17778 // Otherwise, fall back.
17779 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17780 DAG, Subtarget);
17781 }
17782
17783 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17784 // shuffle.
17785 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17786 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17787 return Result;
17788
17789 // If we have VLX support, we can use VEXPAND.
17790 if (Subtarget.hasVLX())
17791 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17792 DAG, Subtarget))
17793 return V;
17794
17795 // For non-AVX512, if the mask is of 16-bit elements within a lane then try to
17796 // split, since after splitting we get more efficient code using vpunpcklwd and
17797 // vpunpckhwd instructions than with vblend.
17798 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17799 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17800 DAG);
17801
17802 // If we have AVX2 then we always want to lower with a blend because at v8 we
17803 // can fully permute the elements.
17804 if (Subtarget.hasAVX2())
17805 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17806 Subtarget, DAG);
17807
17808 // Otherwise fall back on generic lowering.
17809 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17810 Subtarget, DAG);
17811}
17812
17813/// Handle lowering of 8-lane 32-bit integer shuffles.
17814///
17815/// This routine is only called when we have AVX2 and thus a reasonable
17816 /// instruction set for v8i32 shuffling.
17817static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17818 const APInt &Zeroable, SDValue V1, SDValue V2,
17819 const X86Subtarget &Subtarget,
17820 SelectionDAG &DAG) {
17821 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17822 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")(static_cast <bool> (V2.getSimpleValueType() == MVT::v8i32
&& "Bad operand type!") ? void (0) : __assert_fail (
"V2.getSimpleValueType() == MVT::v8i32 && \"Bad operand type!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17822, __extension__
__PRETTY_FUNCTION__))
;
17823 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")(static_cast <bool> (Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"
) ? void (0) : __assert_fail ("Mask.size() == 8 && \"Unexpected mask size for v8 shuffle!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17823, __extension__
__PRETTY_FUNCTION__))
;
17824 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!")(static_cast <bool> (Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"
) ? void (0) : __assert_fail ("Subtarget.hasAVX2() && \"We can only lower v8i32 with AVX2!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17824, __extension__
__PRETTY_FUNCTION__))
;
17825
17826 // Whenever we can lower this as a zext, that instruction is strictly faster
17827 // than any alternative. It also allows us to fold memory operands into the
17828 // shuffle in many cases.
17829 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17830 Zeroable, Subtarget, DAG))
17831 return ZExt;
17832
17833   // For non-AVX512, if the mask consists of 16-bit elements within each lane,
17834   // then try to split, since after the split we get more efficient code than
17835   // with vblend by using vpunpcklwd and vpunpckhwd.
17836 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17837 !Subtarget.hasAVX512())
17838 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17839 DAG);
17840
17841 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17842 Zeroable, Subtarget, DAG))
17843 return Blend;
17844
17845 // Check for being able to broadcast a single element.
17846 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17847 Subtarget, DAG))
17848 return Broadcast;
17849
17850 // If the shuffle mask is repeated in each 128-bit lane we can use more
17851 // efficient instructions that mirror the shuffles across the two 128-bit
17852 // lanes.
17853 SmallVector<int, 4> RepeatedMask;
17854 bool Is128BitLaneRepeatedShuffle =
17855 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17856 if (Is128BitLaneRepeatedShuffle) {
17857     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17858 if (V2.isUndef())
17859 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17860 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17861
17862 // Use dedicated unpack instructions for masks that match their pattern.
17863 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17864 return V;
17865 }
17866
17867 // Try to use shift instructions.
17868 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17869 Zeroable, Subtarget, DAG))
17870 return Shift;
17871
17872 // If we have VLX support, we can use VALIGN or EXPAND.
17873 if (Subtarget.hasVLX()) {
17874 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17875 Subtarget, DAG))
17876 return Rotate;
17877
17878 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17879 DAG, Subtarget))
17880 return V;
17881 }
17882
17883 // Try to use byte rotation instructions.
17884 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17885 Subtarget, DAG))
17886 return Rotate;
17887
17888 // Try to create an in-lane repeating shuffle mask and then shuffle the
17889 // results into the target lanes.
17890 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17891 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17892 return V;
17893
17894 if (V2.isUndef()) {
17895 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17896 // because that should be faster than the variable permute alternatives.
17897 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17898 return V;
17899
17900 // If the shuffle patterns aren't repeated but it's a single input, directly
17901 // generate a cross-lane VPERMD instruction.
17902 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17903 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17904 }
17905
17906 // Assume that a single SHUFPS is faster than an alternative sequence of
17907 // multiple instructions (even if the CPU has a domain penalty).
17908 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17909 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17910 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17911 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17912 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17913 CastV1, CastV2, DAG);
17914 return DAG.getBitcast(MVT::v8i32, ShufPS);
17915 }
17916
17917 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17918 // shuffle.
17919 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17920 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17921 return Result;
17922
17923 // Otherwise fall back on generic blend lowering.
17924 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17925 Subtarget, DAG);
17926}
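// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// a simplified model of the "repeated in each 128-bit lane" test used above
// for v8i32/v8f32. Every element must stay inside its own 128-bit lane and
// all lanes must use the same in-lane pattern; entries are normalised to
// [0,4) for V1 and [4,8) for V2, like the RepeatedMask the real helper fills.
static bool isLaneRepeatedV8(const int Mask[8], int RepeatedMask[4]) {
  const int NumElts = 8, LaneSize = 4;
  for (int i = 0; i != LaneSize; ++i)
    RepeatedMask[i] = -1;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                    // undef matches anything
    if ((M % NumElts) / LaneSize != i / LaneSize)
      return false;                                // crosses a 128-bit lane
    int Local = (M % LaneSize) + (M < NumElts ? 0 : LaneSize);
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = Local;
    else if (RepeatedMask[i % LaneSize] != Local)
      return false;                                // lanes disagree
  }
  return true; // e.g. {0, 8, 1, 9, 4, 12, 5, 13} -> RepeatedMask {0, 4, 1, 5}
}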
17927
17928/// Handle lowering of 16-lane 16-bit integer shuffles.
17929///
17930/// This routine is only called when we have AVX2 and thus a reasonable
17931 /// instruction set for v16i16 shuffling.
17932static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17933 const APInt &Zeroable, SDValue V1, SDValue V2,
17934 const X86Subtarget &Subtarget,
17935 SelectionDAG &DAG) {
17936   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17937   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17938   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17939   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17940
17941 // Whenever we can lower this as a zext, that instruction is strictly faster
17942 // than any alternative. It also allows us to fold memory operands into the
17943 // shuffle in many cases.
17944 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17945 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17946 return ZExt;
17947
17948 // Check for being able to broadcast a single element.
17949 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17950 Subtarget, DAG))
17951 return Broadcast;
17952
17953 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17954 Zeroable, Subtarget, DAG))
17955 return Blend;
17956
17957 // Use dedicated unpack instructions for masks that match their pattern.
17958 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17959 return V;
17960
17961 // Use dedicated pack instructions for masks that match their pattern.
17962 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17963 Subtarget))
17964 return V;
17965
17966   // Try to lower using a truncation.
17967 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17968 Subtarget, DAG))
17969 return V;
17970
17971 // Try to use shift instructions.
17972 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17973 Zeroable, Subtarget, DAG))
17974 return Shift;
17975
17976 // Try to use byte rotation instructions.
17977 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17978 Subtarget, DAG))
17979 return Rotate;
17980
17981 // Try to create an in-lane repeating shuffle mask and then shuffle the
17982 // results into the target lanes.
17983 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17984 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17985 return V;
17986
17987 if (V2.isUndef()) {
17988 // Try to use bit rotation instructions.
17989 if (SDValue Rotate =
17990 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17991 return Rotate;
17992
17993 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17994 // because that should be faster than the variable permute alternatives.
17995 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17996 return V;
17997
17998 // There are no generalized cross-lane shuffle operations available on i16
17999 // element types.
18000 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18001 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18002 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18003 return V;
18004
18005 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18006 DAG, Subtarget);
18007 }
18008
18009 SmallVector<int, 8> RepeatedMask;
18010 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18011 // As this is a single-input shuffle, the repeated mask should be
18012 // a strictly valid v8i16 mask that we can pass through to the v8i16
18013 // lowering to handle even the v16 case.
18014 return lowerV8I16GeneralSingleInputShuffle(
18015 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18016 }
18017 }
18018
18019 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18020 Zeroable, Subtarget, DAG))
18021 return PSHUFB;
18022
18023 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18024 if (Subtarget.hasBWI())
18025 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18026
18027 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18028 // shuffle.
18029 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18030 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18031 return Result;
18032
18033 // Try to permute the lanes and then use a per-lane permute.
18034 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18035 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18036 return V;
18037
18038 // Otherwise fall back on generic lowering.
18039 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18040 Subtarget, DAG);
18041}
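// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// "crossing" a 128-bit lane, as tested by is128BitLaneCrossingShuffleMask in
// the code above, means some result element is sourced from a different
// 128-bit lane than the one it lands in (within either operand).
static bool crossesLanes(const int *Mask, int NumElts, int EltSizeInBits) {
  int LaneSize = 128 / EltSizeInBits;              // elements per 128-bit lane
  for (int i = 0; i != NumElts; ++i)
    if (Mask[i] >= 0 && (Mask[i] % NumElts) / LaneSize != i / LaneSize)
      return true;                                 // pulled across a lane
  return false;
}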
18042
18043/// Handle lowering of 32-lane 8-bit integer shuffles.
18044///
18045/// This routine is only called when we have AVX2 and thus a reasonable
18046 /// instruction set for v32i8 shuffling.
18047static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18048 const APInt &Zeroable, SDValue V1, SDValue V2,
18049 const X86Subtarget &Subtarget,
18050 SelectionDAG &DAG) {
18051   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18052   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18053   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18054   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18055
18056 // Whenever we can lower this as a zext, that instruction is strictly faster
18057 // than any alternative. It also allows us to fold memory operands into the
18058 // shuffle in many cases.
18059 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18060 Zeroable, Subtarget, DAG))
18061 return ZExt;
18062
18063 // Check for being able to broadcast a single element.
18064 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18065 Subtarget, DAG))
18066 return Broadcast;
18067
18068 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18069 Zeroable, Subtarget, DAG))
18070 return Blend;
18071
18072 // Use dedicated unpack instructions for masks that match their pattern.
18073 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18074 return V;
18075
18076 // Use dedicated pack instructions for masks that match their pattern.
18077 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18078 Subtarget))
18079 return V;
18080
18081   // Try to lower using a truncation.
18082 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18083 Subtarget, DAG))
18084 return V;
18085
18086 // Try to use shift instructions.
18087 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
18088 Zeroable, Subtarget, DAG))
18089 return Shift;
18090
18091 // Try to use byte rotation instructions.
18092 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18093 Subtarget, DAG))
18094 return Rotate;
18095
18096 // Try to use bit rotation instructions.
18097 if (V2.isUndef())
18098 if (SDValue Rotate =
18099 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18100 return Rotate;
18101
18102 // Try to create an in-lane repeating shuffle mask and then shuffle the
18103 // results into the target lanes.
18104 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18105 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18106 return V;
18107
18108 // There are no generalized cross-lane shuffle operations available on i8
18109 // element types.
18110 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18111 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18112 // because that should be faster than the variable permute alternatives.
18113 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18114 return V;
18115
18116 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18117 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18118 return V;
18119
18120 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18121 DAG, Subtarget);
18122 }
18123
18124 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18125 Zeroable, Subtarget, DAG))
18126 return PSHUFB;
18127
18128 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18129 if (Subtarget.hasVBMI())
18130 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18131
18132 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18133 // shuffle.
18134 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18135 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18136 return Result;
18137
18138 // Try to permute the lanes and then use a per-lane permute.
18139 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18140 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18141 return V;
18142
18143   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18144   // by zeroable elements in the remaining 24 elements. Turn this into two
18145   // vmovqb instructions shuffled together.
18146 if (Subtarget.hasVLX())
18147 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18148 Mask, Zeroable, DAG))
18149 return V;
18150
18151 // Otherwise fall back on generic lowering.
18152 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18153 Subtarget, DAG);
18154}
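// Illustrative sketch (scalar model, not part of X86ISelLowering.cpp): the
// PSHUFB lowering used above depends on the instruction's control-byte
// semantics -- each output byte selects a byte from within its own 128-bit
// lane, and a set high bit in the control byte produces zero.
static void vpshufb256Model(const unsigned char Src[32],
                            const unsigned char Ctrl[32],
                            unsigned char Dst[32]) {
  for (int i = 0; i != 32; ++i) {
    int LaneBase = (i / 16) * 16;                  // stay inside the lane
    unsigned char C = Ctrl[i];
    Dst[i] = (C & 0x80) ? 0 : Src[LaneBase + (C & 0x0F)];
  }
}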
18155
18156/// High-level routine to lower various 256-bit x86 vector shuffles.
18157///
18158/// This routine either breaks down the specific type of a 256-bit x86 vector
18159/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18160/// together based on the available instructions.
18161static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18162 SDValue V1, SDValue V2, const APInt &Zeroable,
18163 const X86Subtarget &Subtarget,
18164 SelectionDAG &DAG) {
18165 // If we have a single input to the zero element, insert that into V1 if we
18166 // can do so cheaply.
18167 int NumElts = VT.getVectorNumElements();
18168 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18169
18170 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18171 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18172 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18173 return Insertion;
18174
18175 // Handle special cases where the lower or upper half is UNDEF.
18176 if (SDValue V =
18177 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18178 return V;
18179
18180 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18181 // can check for those subtargets here and avoid much of the subtarget
18182 // querying in the per-vector-type lowering routines. With AVX1 we have
18183 // essentially *zero* ability to manipulate a 256-bit vector with integer
18184 // types. Since we'll use floating point types there eventually, just
18185 // immediately cast everything to a float and operate entirely in that domain.
18186 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18187 int ElementBits = VT.getScalarSizeInBits();
18188 if (ElementBits < 32) {
18189 // No floating point type available, if we can't use the bit operations
18190 // for masking/blending then decompose into 128-bit vectors.
18191 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18192 Subtarget, DAG))
18193 return V;
18194 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18195 return V;
18196 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18197 }
18198
18199 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18200 VT.getVectorNumElements());
18201 V1 = DAG.getBitcast(FpVT, V1);
18202 V2 = DAG.getBitcast(FpVT, V2);
18203 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18204 }
18205
18206 if (VT == MVT::v16f16) {
18207 V1 = DAG.getBitcast(MVT::v16i16, V1);
18208 V2 = DAG.getBitcast(MVT::v16i16, V2);
18209 return DAG.getBitcast(MVT::v16f16,
18210 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18211 }
18212
18213 switch (VT.SimpleTy) {
18214 case MVT::v4f64:
18215 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18216 case MVT::v4i64:
18217 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18218 case MVT::v8f32:
18219 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18220 case MVT::v8i32:
18221 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18222 case MVT::v16i16:
18223 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18224 case MVT::v32i8:
18225 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18226
18227 default:
18228 llvm_unreachable("Not a valid 256-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 256-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 18228)
;
18229 }
18230}
18231
18232 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
18233static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18234 const APInt &Zeroable, SDValue V1, SDValue V2,
18235 const X86Subtarget &Subtarget,
18236 SelectionDAG &DAG) {
18237   assert(VT.getScalarSizeInBits() == 64 &&
18238          "Unexpected element type size for 128bit shuffle.");
18239
18240   // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
18241   // most probably the better solution for that case.
18242   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
18243
18244 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
18245 SmallVector<int, 4> Widened128Mask;
18246 if (!canWidenShuffleElements(Mask, Widened128Mask))
18247 return SDValue();
18248   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
18249
18250 // Try to use an insert into a zero vector.
18251 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
18252 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
18253 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
18254 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
18255 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
18256 DAG.getIntPtrConstant(0, DL));
18257 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18258 getZeroVector(VT, Subtarget, DAG, DL), LoV,
18259 DAG.getIntPtrConstant(0, DL));
18260 }
18261
18262 // Check for patterns which can be matched with a single insert of a 256-bit
18263 // subvector.
18264 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
18265 if (OnlyUsesV1 ||
18266 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
18267 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
18268 SDValue SubVec =
18269 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
18270 DAG.getIntPtrConstant(0, DL));
18271 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
18272 DAG.getIntPtrConstant(4, DL));
18273 }
18274
18275 // See if this is an insertion of the lower 128-bits of V2 into V1.
18276 bool IsInsert = true;
18277 int V2Index = -1;
18278 for (int i = 0; i < 4; ++i) {
18279     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18280 if (Widened128Mask[i] < 0)
18281 continue;
18282
18283 // Make sure all V1 subvectors are in place.
18284 if (Widened128Mask[i] < 4) {
18285 if (Widened128Mask[i] != i) {
18286 IsInsert = false;
18287 break;
18288 }
18289 } else {
18290       // Make sure we only have a single V2 index and it's the lowest 128 bits.
18291 if (V2Index >= 0 || Widened128Mask[i] != 4) {
18292 IsInsert = false;
18293 break;
18294 }
18295 V2Index = i;
18296 }
18297 }
18298 if (IsInsert && V2Index >= 0) {
18299 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
18300 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
18301 DAG.getIntPtrConstant(0, DL));
18302 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
18303 }
18304
18305   // See if we can widen to a 256-bit lane shuffle; we're going to lose the
18306   // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
18307   // widening where possible we at least ensure the lanes stay sequential to
18308   // help later combines.
18309 SmallVector<int, 2> Widened256Mask;
18310 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
18311 Widened128Mask.clear();
18312 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
18313 }
18314
18315 // Try to lower to vshuf64x2/vshuf32x4.
18316 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
18317 unsigned PermMask = 0;
18318   // Ensure elements came from the same Op.
18319 for (int i = 0; i < 4; ++i) {
18320     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18321 if (Widened128Mask[i] < 0)
18322 continue;
18323
18324 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
18325 unsigned OpIndex = i / 2;
18326 if (Ops[OpIndex].isUndef())
18327 Ops[OpIndex] = Op;
18328 else if (Ops[OpIndex] != Op)
18329 return SDValue();
18330
18331 // Convert the 128-bit shuffle mask selection values into 128-bit selection
18332 // bits defined by a vshuf64x2 instruction's immediate control byte.
18333 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
18334 }
18335
18336 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
18337 DAG.getTargetConstant(PermMask, DL, MVT::i8));
18338}
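// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// the vshuf64x2/vshuf32x4 immediate built in the loop above gives each of the
// four 128-bit result lanes a 2-bit selector taken modulo 4; which operand
// feeds each 256-bit half is chosen separately through the Ops[] array.
static unsigned shuf128Imm(const int Widened128Mask[4]) {
  unsigned PermMask = 0;
  for (int i = 0; i != 4; ++i)
    if (Widened128Mask[i] >= 0)
      PermMask |= (unsigned)(Widened128Mask[i] % 4) << (i * 2);
  return PermMask; // e.g. {0, 1, 4, 5} -> 0x44 (lanes 0,1 of V1, then of V2)
}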
18339
18340/// Handle lowering of 8-lane 64-bit floating point shuffles.
18341static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18342 const APInt &Zeroable, SDValue V1, SDValue V2,
18343 const X86Subtarget &Subtarget,
18344 SelectionDAG &DAG) {
18345   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18346   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18347   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18348
18349 if (V2.isUndef()) {
18350 // Use low duplicate instructions for masks that match their pattern.
18351 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18352 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
18353
18354 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
18355 // Non-half-crossing single input shuffles can be lowered with an
18356 // interleaved permutation.
18357 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18358 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18359 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18360 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18361 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18362 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18363 }
18364
18365 SmallVector<int, 4> RepeatedMask;
18366 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18367 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18368 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18369 }
18370
18371 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18372 V2, Subtarget, DAG))
18373 return Shuf128;
18374
18375 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18376 return Unpck;
18377
18378 // Check if the blend happens to exactly fit that of SHUFPD.
18379 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18380 Zeroable, Subtarget, DAG))
18381 return Op;
18382
18383 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18384 DAG, Subtarget))
18385 return V;
18386
18387 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18388 Zeroable, Subtarget, DAG))
18389 return Blend;
18390
18391 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18392}
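// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// for v8f64 VPERMILPI each immediate bit selects the low (0) or high (1)
// double of the 128-bit lane its result element lives in, which is what the
// chain of (Mask[i] == odd-index) comparisons above computes.
static unsigned vpermilpdImm(const int Mask[8]) {
  unsigned Imm = 0;
  for (int i = 0; i != 8; ++i)
    Imm |= (unsigned)(Mask[i] == (i | 1)) << i;    // picked the high element?
  return Imm; // e.g. {1, 0, 3, 2, 5, 4, 7, 6} (swap every pair) -> 0x55
}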
18393
18394/// Handle lowering of 16-lane 32-bit floating point shuffles.
18395static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18396 const APInt &Zeroable, SDValue V1, SDValue V2,
18397 const X86Subtarget &Subtarget,
18398 SelectionDAG &DAG) {
18399   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18400   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18401   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18402
18403 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18404 // options to efficiently lower the shuffle.
18405 SmallVector<int, 4> RepeatedMask;
18406 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18407     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18408
18409 // Use even/odd duplicate instructions for masks that match their pattern.
18410 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18411 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18412 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18413 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18414
18415 if (V2.isUndef())
18416 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18417 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18418
18419 // Use dedicated unpack instructions for masks that match their pattern.
18420 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
18421 return V;
18422
18423 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18424 Zeroable, Subtarget, DAG))
18425 return Blend;
18426
18427 // Otherwise, fall back to a SHUFPS sequence.
18428 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18429 }
18430
18431 // Try to create an in-lane repeating shuffle mask and then shuffle the
18432 // results into the target lanes.
18433 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18434 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18435 return V;
18436
18437   // If we have a single-input shuffle with different shuffle patterns in the
18438   // 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
18439 if (V2.isUndef() &&
18440 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18441 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18442 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18443 }
18444
18445 // If we have AVX512F support, we can use VEXPAND.
18446 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
18447 V1, V2, DAG, Subtarget))
18448 return V;
18449
18450 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18451}
18452
18453/// Handle lowering of 8-lane 64-bit integer shuffles.
18454static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18455 const APInt &Zeroable, SDValue V1, SDValue V2,
18456 const X86Subtarget &Subtarget,
18457 SelectionDAG &DAG) {
18458   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18459   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18460   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18461
18462 if (V2.isUndef()) {
18463 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18464 // can use lower latency instructions that will operate on all four
18465 // 128-bit lanes.
18466 SmallVector<int, 2> Repeated128Mask;
18467 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18468 SmallVector<int, 4> PSHUFDMask;
18469 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18470 return DAG.getBitcast(
18471 MVT::v8i64,
18472 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18473 DAG.getBitcast(MVT::v16i32, V1),
18474 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18475 }
18476
18477 SmallVector<int, 4> Repeated256Mask;
18478 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18479 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18480 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18481 }
18482
18483 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18484 V2, Subtarget, DAG))
18485 return Shuf128;
18486
18487 // Try to use shift instructions.
18488 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
18489 Zeroable, Subtarget, DAG))
18490 return Shift;
18491
18492 // Try to use VALIGN.
18493 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18494 Subtarget, DAG))
18495 return Rotate;
18496
18497 // Try to use PALIGNR.
18498 if (Subtarget.hasBWI())
18499 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18500 Subtarget, DAG))
18501 return Rotate;
18502
18503 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
18504 return Unpck;
18505
18506 // If we have AVX512F support, we can use VEXPAND.
18507 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
18508 DAG, Subtarget))
18509 return V;
18510
18511 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18512 Zeroable, Subtarget, DAG))
18513 return Blend;
18514
18515 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18516}
18517
18518/// Handle lowering of 16-lane 32-bit integer shuffles.
18519static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18520 const APInt &Zeroable, SDValue V1, SDValue V2,
18521 const X86Subtarget &Subtarget,
18522 SelectionDAG &DAG) {
18523   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18524   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18525   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18526
18527 // Whenever we can lower this as a zext, that instruction is strictly faster
18528 // than any alternative. It also allows us to fold memory operands into the
18529 // shuffle in many cases.
18530 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18531 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18532 return ZExt;
18533
18534 // If the shuffle mask is repeated in each 128-bit lane we can use more
18535 // efficient instructions that mirror the shuffles across the four 128-bit
18536 // lanes.
18537 SmallVector<int, 4> RepeatedMask;
18538 bool Is128BitLaneRepeatedShuffle =
18539 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18540 if (Is128BitLaneRepeatedShuffle) {
18541     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18542 if (V2.isUndef())
18543 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18544 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18545
18546 // Use dedicated unpack instructions for masks that match their pattern.
18547 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
18548 return V;
18549 }
18550
18551 // Try to use shift instructions.
18552 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
18553 Zeroable, Subtarget, DAG))
18554 return Shift;
18555
18556 // Try to use VALIGN.
18557 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18558 Subtarget, DAG))
18559 return Rotate;
18560
18561 // Try to use byte rotation instructions.
18562 if (Subtarget.hasBWI())
18563 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18564 Subtarget, DAG))
18565 return Rotate;
18566
18567 // Assume that a single SHUFPS is faster than using a permv shuffle.
18568 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18569 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18570 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18571 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18572 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18573 CastV1, CastV2, DAG);
18574 return DAG.getBitcast(MVT::v16i32, ShufPS);
18575 }
18576
18577 // Try to create an in-lane repeating shuffle mask and then shuffle the
18578 // results into the target lanes.
18579 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18580 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18581 return V;
18582
18583 // If we have AVX512F support, we can use VEXPAND.
18584 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
18585 DAG, Subtarget))
18586 return V;
18587
18588 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18589 Zeroable, Subtarget, DAG))
18590 return Blend;
18591
18592 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18593}
18594
18595/// Handle lowering of 32-lane 16-bit integer shuffles.
18596static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18597 const APInt &Zeroable, SDValue V1, SDValue V2,
18598 const X86Subtarget &Subtarget,
18599 SelectionDAG &DAG) {
18600   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18601   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18602   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18603   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18604
18605 // Whenever we can lower this as a zext, that instruction is strictly faster
18606 // than any alternative. It also allows us to fold memory operands into the
18607 // shuffle in many cases.
18608 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18609 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18610 return ZExt;
18611
18612 // Use dedicated unpack instructions for masks that match their pattern.
18613 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
18614 return V;
18615
18616 // Use dedicated pack instructions for masks that match their pattern.
18617 if (SDValue V =
18618 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
18619 return V;
18620
18621 // Try to use shift instructions.
18622 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
18623 Zeroable, Subtarget, DAG))
18624 return Shift;
18625
18626 // Try to use byte rotation instructions.
18627 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18628 Subtarget, DAG))
18629 return Rotate;
18630
18631 if (V2.isUndef()) {
18632 // Try to use bit rotation instructions.
18633 if (SDValue Rotate =
18634 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18635 return Rotate;
18636
18637 SmallVector<int, 8> RepeatedMask;
18638 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18639 // As this is a single-input shuffle, the repeated mask should be
18640 // a strictly valid v8i16 mask that we can pass through to the v8i16
18641 // lowering to handle even the v32 case.
18642 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18643 RepeatedMask, Subtarget, DAG);
18644 }
18645 }
18646
18647 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18648 Zeroable, Subtarget, DAG))
18649 return Blend;
18650
18651 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18652 Zeroable, Subtarget, DAG))
18653 return PSHUFB;
18654
18655 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18656}
18657
18658/// Handle lowering of 64-lane 8-bit integer shuffles.
18659static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18660 const APInt &Zeroable, SDValue V1, SDValue V2,
18661 const X86Subtarget &Subtarget,
18662 SelectionDAG &DAG) {
18663   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18664   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18665   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18666   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18667
18668 // Whenever we can lower this as a zext, that instruction is strictly faster
18669 // than any alternative. It also allows us to fold memory operands into the
18670 // shuffle in many cases.
18671 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18672 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18673 return ZExt;
18674
18675 // Use dedicated unpack instructions for masks that match their pattern.
18676 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18677 return V;
18678
18679 // Use dedicated pack instructions for masks that match their pattern.
18680 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18681 Subtarget))
18682 return V;
18683
18684 // Try to use shift instructions.
18685 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18686 Zeroable, Subtarget, DAG))
18687 return Shift;
18688
18689 // Try to use byte rotation instructions.
18690 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18691 Subtarget, DAG))
18692 return Rotate;
18693
18694 // Try to use bit rotation instructions.
18695 if (V2.isUndef())
18696 if (SDValue Rotate =
18697 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18698 return Rotate;
18699
18700 // Lower as AND if possible.
18701 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18702 Zeroable, Subtarget, DAG))
18703 return Masked;
18704
18705 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18706 Zeroable, Subtarget, DAG))
18707 return PSHUFB;
18708
18709 // Try to create an in-lane repeating shuffle mask and then shuffle the
18710 // results into the target lanes.
18711 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18712 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18713 return V;
18714
18715 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
18716 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18717 return Result;
18718
18719 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18720 Zeroable, Subtarget, DAG))
18721 return Blend;
18722
18723 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18724 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18725 // PALIGNR will be cheaper than the second PSHUFB+OR.
18726 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18727 Mask, Subtarget, DAG))
18728 return V;
18729
18730 // If we can't directly blend but can use PSHUFB, that will be better as it
18731 // can both shuffle and set up the inefficient blend.
18732 bool V1InUse, V2InUse;
18733 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18734 DAG, V1InUse, V2InUse);
18735 }
18736
18737 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18738 // shuffle.
18739 if (!V2.isUndef())
18740 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18741 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18742 return Result;
18743
18744 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18745 if (Subtarget.hasVBMI())
18746 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18747
18748 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18749}
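// Illustrative sketch (scalar model, not part of X86ISelLowering.cpp): the
// byte-rotation lowerings above ultimately target PALIGNR/VPALIGNR, which per
// 128-bit lane concatenates the two sources (first source in the high half)
// and extracts a byte-shifted 16-byte window.
static void palignrLaneModel(const unsigned char Hi[16],
                             const unsigned char Lo[16], int Imm,
                             unsigned char Dst[16]) {
  // Conceptually: Dst = low 16 bytes of ((Hi:Lo) >> (8 * Imm)).
  for (int i = 0; i != 16; ++i) {
    int Idx = i + Imm;
    Dst[i] = Idx < 16 ? Lo[Idx] : (Idx < 32 ? Hi[Idx - 16] : 0);
  }
}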
18750
18751/// High-level routine to lower various 512-bit x86 vector shuffles.
18752///
18753/// This routine either breaks down the specific type of a 512-bit x86 vector
18754/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18755/// together based on the available instructions.
18756static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18757 MVT VT, SDValue V1, SDValue V2,
18758 const APInt &Zeroable,
18759 const X86Subtarget &Subtarget,
18760 SelectionDAG &DAG) {
18761   assert(Subtarget.hasAVX512() &&
18762          "Cannot lower 512-bit vectors w/ basic ISA!");
18763
18764 // If we have a single input to the zero element, insert that into V1 if we
18765 // can do so cheaply.
18766 int NumElts = Mask.size();
18767 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18768
18769 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18770 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18771 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18772 return Insertion;
18773
18774 // Handle special cases where the lower or upper half is UNDEF.
18775 if (SDValue V =
18776 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18777 return V;
18778
18779 // Check for being able to broadcast a single element.
18780 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18781 Subtarget, DAG))
18782 return Broadcast;
18783
18784 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18785 // Try using bit ops for masking and blending before falling back to
18786 // splitting.
18787 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18788 Subtarget, DAG))
18789 return V;
18790 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18791 return V;
18792
18793 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18794 }
18795
18796 if (VT == MVT::v32f16) {
18797 V1 = DAG.getBitcast(MVT::v32i16, V1);
18798 V2 = DAG.getBitcast(MVT::v32i16, V2);
18799 return DAG.getBitcast(MVT::v32f16,
18800 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18801 }
18802
18803 // Dispatch to each element type for lowering. If we don't have support for
18804 // specific element type shuffles at 512 bits, immediately split them and
18805 // lower them. Each lowering routine of a given type is allowed to assume that
18806 // the requisite ISA extensions for that element type are available.
18807 switch (VT.SimpleTy) {
18808 case MVT::v8f64:
18809 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18810 case MVT::v16f32:
18811 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18812 case MVT::v8i64:
18813 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18814 case MVT::v16i32:
18815 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18816 case MVT::v32i16:
18817 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18818 case MVT::v64i8:
18819 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18820
18821 default:
18822 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 18822)
;
18823 }
18824}
18825
18826static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18827 MVT VT, SDValue V1, SDValue V2,
18828 const X86Subtarget &Subtarget,
18829 SelectionDAG &DAG) {
18830 // Shuffle should be unary.
18831 if (!V2.isUndef())
18832 return SDValue();
18833
18834 int ShiftAmt = -1;
18835 int NumElts = Mask.size();
18836 for (int i = 0; i != NumElts; ++i) {
18837 int M = Mask[i];
18838     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18839            "Unexpected mask index.");
18840 if (M < 0)
18841 continue;
18842
18843 // The first non-undef element determines our shift amount.
18844 if (ShiftAmt < 0) {
18845 ShiftAmt = M - i;
18846 // Need to be shifting right.
18847 if (ShiftAmt <= 0)
18848 return SDValue();
18849 }
18850 // All non-undef elements must shift by the same amount.
18851 if (ShiftAmt != M - i)
18852 return SDValue();
18853 }
18854   assert(ShiftAmt >= 0 && "All undef?");
18855
18856   // Great, we found a right shift.
18857 MVT WideVT = VT;
18858 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18859 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18860 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18861 DAG.getUNDEF(WideVT), V1,
18862 DAG.getIntPtrConstant(0, DL));
18863 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18864 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18865 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18866 DAG.getIntPtrConstant(0, DL));
18867}
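// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// the matcher above only fires when every defined mask element implies the
// same positive shift, i.e. the whole k-register is read shifted right by a
// constant amount.
static int matchKShiftRight(const int *Mask, int NumElts) {
  int ShiftAmt = -1;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;                    // undef elements match any shift
    int Amt = Mask[i] - i;         // shift implied by this element
    if (ShiftAmt < 0)
      ShiftAmt = Amt;
    if (Amt != ShiftAmt || Amt <= 0)
      return -1;                   // not a uniform right shift
  }
  return ShiftAmt; // e.g. {2, 3, 4, 5, 6, 7, -1, -1} -> 2 (KSHIFTR by 2)
}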
18868
18869// Determine if this shuffle can be implemented with a KSHIFT instruction.
18870// Returns the shift amount if possible or -1 if not. This is a simplified
18871// version of matchShuffleAsShift.
18872static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18873 int MaskOffset, const APInt &Zeroable) {
18874 int Size = Mask.size();
18875
18876 auto CheckZeros = [&](int Shift, bool Left) {
18877 for (int j = 0; j < Shift; ++j)
18878 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18879 return false;
18880
18881 return true;
18882 };
18883
18884 auto MatchShift = [&](int Shift, bool Left) {
18885 unsigned Pos = Left ? Shift : 0;
18886 unsigned Low = Left ? 0 : Shift;
18887 unsigned Len = Size - Shift;
18888 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18889 };
18890
18891 for (int Shift = 1; Shift != Size; ++Shift)
18892 for (bool Left : {true, false})
18893 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18894 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18895 return Shift;
18896 }
18897
18898 return -1;
18899}
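
match1BitShuffleAsKSHIFT tries every shift amount in both directions, requiring the vacated lanes to be zeroable and the remaining lanes to count up sequentially. A scalar re-statement under simplified types (Zeroable modelled as std::vector<bool>, -1 used for undef/zeroed lanes; an illustration, not the LLVM API):

  #include <cstdio>
  #include <vector>

  static bool seqOrUndef(const std::vector<int> &Mask, unsigned Pos,
                         unsigned Len, int Low) {
    for (unsigned i = 0; i != Len; ++i) {
      int M = Mask[Pos + i];
      if (M >= 0 && M != Low + (int)i)
        return false;             // Defined elements must count up from Low.
    }
    return true;
  }

  static int matchKSHIFT(bool &Left, const std::vector<int> &Mask,
                         int MaskOffset, const std::vector<bool> &Zeroable) {
    int Size = (int)Mask.size();
    for (int Shift = 1; Shift != Size; ++Shift)
      for (bool L : {true, false}) {
        bool ZerosOk = true;
        for (int j = 0; j < Shift; ++j)       // Vacated lanes must be zeroable.
          ZerosOk &= Zeroable[j + (L ? 0 : Size - Shift)];
        unsigned Pos = L ? Shift : 0, Low = L ? 0 : Shift;
        if (ZerosOk && seqOrUndef(Mask, Pos, Size - Shift, Low + MaskOffset)) {
          Left = L;
          return Shift;
        }
      }
    return -1;
  }

  int main() {
    // <z,z,0,1,2,3,4,5> with the two low lanes zeroable matches KSHIFTL by 2.
    bool Left = false;
    std::vector<int> Mask = {-1, -1, 0, 1, 2, 3, 4, 5};
    std::vector<bool> Z = {true, true, false, false, false, false, false, false};
    int Shift = matchKSHIFT(Left, Mask, 0, Z);
    std::printf("shift=%d left=%d\n", Shift, (int)Left);
  }

For the mask <z,z,0,1,2,3,4,5> with the two low lanes zeroable this reports a left shift by 2, matching what KSHIFTL would produce.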
18900
18901
18902// Lower vXi1 vector shuffles.
18903 // There is no dedicated instruction on AVX-512 that shuffles the masks.
18904 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18905 // vector, shuffle it, and then truncate it back.
18906static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18907 MVT VT, SDValue V1, SDValue V2,
18908 const APInt &Zeroable,
18909 const X86Subtarget &Subtarget,
18910 SelectionDAG &DAG) {
18911 assert(Subtarget.hasAVX512() &&
18912 "Cannot lower 512-bit vectors w/o basic ISA!");
18913
18914 int NumElts = Mask.size();
18915
18916 // Try to recognize shuffles that are just padding a subvector with zeros.
18917 int SubvecElts = 0;
18918 int Src = -1;
18919 for (int i = 0; i != NumElts; ++i) {
18920 if (Mask[i] >= 0) {
18921 // Grab the source from the first valid mask element. All subsequent elements
18922 // must use this same source.
18923 if (Src < 0)
18924 Src = Mask[i] / NumElts;
18925 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18926 break;
18927 }
18928
18929 ++SubvecElts;
18930 }
18931 assert(SubvecElts != NumElts && "Identity shuffle?");
18932
18933 // Clip to a power of 2.
18934 SubvecElts = PowerOf2Floor(SubvecElts);
18935
18936 // Make sure the number of zeroable bits in the top at least covers the bits
18937 // not covered by the subvector.
18938 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18939 assert(Src >= 0 && "Expected a source!");
18940 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18941 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18942 Src == 0 ? V1 : V2,
18943 DAG.getIntPtrConstant(0, DL));
18944 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18945 DAG.getConstant(0, DL, VT),
18946 Extract, DAG.getIntPtrConstant(0, DL));
18947 }
18948
18949 // Try a simple shift right with undef elements. Later we'll try with zeros.
18950 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18951 DAG))
18952 return Shift;
18953
18954 // Try to match KSHIFTs.
18955 unsigned Offset = 0;
18956 for (SDValue V : { V1, V2 }) {
18957 unsigned Opcode;
18958 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18959 if (ShiftAmt >= 0) {
18960 MVT WideVT = VT;
18961 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18962 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18963 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18964 DAG.getUNDEF(WideVT), V,
18965 DAG.getIntPtrConstant(0, DL));
18966 // Widened right shifts need two shifts to ensure we shift in zeroes.
18967 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18968 int WideElts = WideVT.getVectorNumElements();
18969 // Shift left to put the original vector in the MSBs of the new size.
18970 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18971 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18972 // Increase the shift amount to account for the left shift.
18973 ShiftAmt += WideElts - NumElts;
18974 }
18975
18976 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18977 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18978 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18979 DAG.getIntPtrConstant(0, DL));
18980 }
18981 Offset += NumElts; // Increment for next iteration.
18982 }
18983
18984 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
18985 // TODO: What other unary shuffles would benefit from this?
18986 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
18987 V1->hasOneUse()) {
18988 SDValue Op0 = V1.getOperand(0);
18989 SDValue Op1 = V1.getOperand(1);
18990 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18991 EVT OpVT = Op0.getValueType();
18992 return DAG.getSetCC(
18993 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18994 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18995 }
18996
18997 MVT ExtVT;
18998 switch (VT.SimpleTy) {
18999 default:
19000 llvm_unreachable("Expected a vector of i1 elements");
19001 case MVT::v2i1:
19002 ExtVT = MVT::v2i64;
19003 break;
19004 case MVT::v4i1:
19005 ExtVT = MVT::v4i32;
19006 break;
19007 case MVT::v8i1:
19008 // Take a 512-bit type; there are more shuffles on KNL. If we have VLX, use a
19009 // 256-bit shuffle.
19010 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19011 break;
19012 case MVT::v16i1:
19013 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19014 // 256-bit operation available.
19015 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19016 break;
19017 case MVT::v32i1:
19018 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19019 // 256-bit operation available.
19020 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19021 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19022 break;
19023 case MVT::v64i1:
19024 // Fall back to scalarization. FIXME: We can do better if the shuffle
19025 // can be partitioned cleanly.
19026 if (!Subtarget.useBWIRegs())
19027 return SDValue();
19028 ExtVT = MVT::v64i8;
19029 break;
19030 }
19031
19032 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19033 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19034
19035 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19036 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
19037 int NumElems = VT.getVectorNumElements();
19038 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19039 (Subtarget.hasDQI() && (NumElems < 32)))
19040 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19041 Shuffle, ISD::SETGT);
19042
19043 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19044}
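
The subvector-padding check at the top of lower1BitShuffle counts how many leading lanes are an identity copy from a single source and then clips that count with PowerOf2Floor. A compact model of just that counting step (illustrative; leadingIdentityElts is a hypothetical name, not an LLVM helper):

  #include <cstdio>
  #include <vector>

  static int leadingIdentityElts(const std::vector<int> &Mask, int &Src) {
    int NumElts = (int)Mask.size();
    int Count = 0;
    Src = -1;
    for (int i = 0; i != NumElts; ++i) {
      if (Mask[i] >= 0) {
        if (Src < 0)
          Src = Mask[i] / NumElts;            // 0 selects V1, 1 selects V2.
        if (Src != Mask[i] / NumElts || Mask[i] % NumElts != i)
          break;                              // Not an identity lane anymore.
      }
      ++Count;
    }
    while (Count & (Count - 1))               // PowerOf2Floor, spelled out.
      Count &= Count - 1;
    return Count;
  }

  int main() {
    int Src;
    // <0,1,2,u,4,9,u,u>: five leading identity-or-undef lanes clip down to 4.
    std::printf("%d\n", leadingIdentityElts({0, 1, 2, -1, 4, 9, -1, -1}, Src));
  }

For <0,1,2,u,4,9,...> the five leading identity-or-undef lanes clip down to 4, which is the widest power-of-two subvector the real code would extract.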
19045
19046/// Helper function that returns true if the shuffle mask should be
19047/// commuted to improve canonicalization.
19048static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19049 int NumElements = Mask.size();
19050
19051 int NumV1Elements = 0, NumV2Elements = 0;
19052 for (int M : Mask)
19053 if (M < 0)
19054 continue;
19055 else if (M < NumElements)
19056 ++NumV1Elements;
19057 else
19058 ++NumV2Elements;
19059
19060 // Commute the shuffle as needed such that more elements come from V1 than
19061 // V2. This allows us to match the shuffle pattern strictly on how many
19062 // elements come from V1 without handling the symmetric cases.
19063 if (NumV2Elements > NumV1Elements)
19064 return true;
19065
19066 assert(NumV1Elements > 0 && "No V1 indices");
19067
19068 if (NumV2Elements == 0)
19069 return false;
19070
19071 // When the number of V1 and V2 elements is the same, try to minimize the
19072 // number of uses of V2 in the low half of the vector. When that is tied,
19073 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19074 // indices for V2. When those are equal, try to ensure that the number of odd
19075 // indices for V1 is lower than the number of odd indices for V2.
19076 if (NumV1Elements == NumV2Elements) {
19077 int LowV1Elements = 0, LowV2Elements = 0;
19078 for (int M : Mask.slice(0, NumElements / 2))
19079 if (M >= NumElements)
19080 ++LowV2Elements;
19081 else if (M >= 0)
19082 ++LowV1Elements;
19083 if (LowV2Elements > LowV1Elements)
19084 return true;
19085 if (LowV2Elements == LowV1Elements) {
19086 int SumV1Indices = 0, SumV2Indices = 0;
19087 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19088 if (Mask[i] >= NumElements)
19089 SumV2Indices += i;
19090 else if (Mask[i] >= 0)
19091 SumV1Indices += i;
19092 if (SumV2Indices < SumV1Indices)
19093 return true;
19094 if (SumV2Indices == SumV1Indices) {
19095 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19096 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19097 if (Mask[i] >= NumElements)
19098 NumV2OddIndices += i % 2;
19099 else if (Mask[i] >= 0)
19100 NumV1OddIndices += i % 2;
19101 if (NumV2OddIndices < NumV1OddIndices)
19102 return true;
19103 }
19104 }
19105 }
19106
19107 return false;
19108}
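
canonicalizeShuffleMaskWithCommute is a pure function of the mask, so its decision chain can be restated compactly. A condensed model (shouldCommute is a hypothetical stand-in; it skips the all-undef assertion the real helper makes):

  #include <cstdio>
  #include <vector>

  static bool shouldCommute(const std::vector<int> &Mask) {
    int N = (int)Mask.size();
    int V1 = 0, V2 = 0;
    for (int M : Mask)
      if (M >= N) ++V2;
      else if (M >= 0) ++V1;
    if (V2 != V1)
      return V2 > V1;                 // Prefer more elements from V1.
    int LowV1 = 0, LowV2 = 0;
    for (int i = 0; i != N / 2; ++i)  // Tie-break on the low half...
      if (Mask[i] >= N) ++LowV2;
      else if (Mask[i] >= 0) ++LowV1;
    if (LowV2 != LowV1)
      return LowV2 > LowV1;
    int Sum1 = 0, Sum2 = 0;           // ...then on the index sums...
    for (int i = 0; i != N; ++i)
      if (Mask[i] >= N) Sum2 += i;
      else if (Mask[i] >= 0) Sum1 += i;
    if (Sum2 != Sum1)
      return Sum2 < Sum1;
    int Odd1 = 0, Odd2 = 0;           // ...then on the odd-position counts.
    for (int i = 0; i != N; ++i)
      if (Mask[i] >= N) Odd2 += i % 2;
      else if (Mask[i] >= 0) Odd1 += i % 2;
    return Odd2 < Odd1;
  }

  int main() {
    // <4,5,0,1> pulls its low half from V2, so commuting is preferred.
    std::printf("%d\n", (int)shouldCommute({4, 5, 0, 1}));
  }

For the v4 mask <4,5,0,1> the element counts tie but the low half comes entirely from V2, so the model, like the helper above, asks for a commute.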
19109
19110// Forward declaration.
19111static SDValue canonicalizeShuffleMaskWithHorizOp(
19112 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19113 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19114 const X86Subtarget &Subtarget);
19115
19116 /// Top-level lowering for x86 vector shuffles.
19117///
19118/// This handles decomposition, canonicalization, and lowering of all x86
19119/// vector shuffles. Most of the specific lowering strategies are encapsulated
19120/// above in helper routines. The canonicalization attempts to widen shuffles
19121/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19122/// s.t. only one of the two inputs needs to be tested, etc.
19123static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19124 SelectionDAG &DAG) {
19125 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19126 ArrayRef<int> OrigMask = SVOp->getMask();
19127 SDValue V1 = Op.getOperand(0);
19128 SDValue V2 = Op.getOperand(1);
19129 MVT VT = Op.getSimpleValueType();
19130 int NumElements = VT.getVectorNumElements();
19131 SDLoc DL(Op);
19132 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19133
19134 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19135 "Can't lower MMX shuffles");
19136
19137 bool V1IsUndef = V1.isUndef();
19138 bool V2IsUndef = V2.isUndef();
19139 if (V1IsUndef && V2IsUndef)
19140 return DAG.getUNDEF(VT);
19141
19142 // When we create a shuffle node, we put the UNDEF node as the second operand,
19143 // but in some cases the first operand may be transformed to UNDEF.
19144 // In this case we should just commute the node.
19145 if (V1IsUndef)
19146 return DAG.getCommutedVectorShuffle(*SVOp);
19147
19148 // Check for non-undef masks pointing at an undef vector and make the masks
19149 // undef as well. This makes it easier to match the shuffle based solely on
19150 // the mask.
19151 if (V2IsUndef &&
19152 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
19153 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
19154 for (int &M : NewMask)
19155 if (M >= NumElements)
19156 M = -1;
19157 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
19158 }
19159
19160 // Check for illegal shuffle mask element index values.
19161 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
19162 (void)MaskUpperLimit;
19163 assert(llvm::all_of(OrigMask,
19164 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
19165 "Out of bounds shuffle index");
19166
19167 // We actually see shuffles that are entirely re-arrangements of a set of
19168 // zero inputs. This mostly happens while decomposing complex shuffles into
19169 // simple ones. Directly lower these as a buildvector of zeros.
19170 APInt KnownUndef, KnownZero;
19171 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
19172
19173 APInt Zeroable = KnownUndef | KnownZero;
19174 if (Zeroable.isAllOnes())
19175 return getZeroVector(VT, Subtarget, DAG, DL);
19176
19177 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
19178
19179 // Try to collapse shuffles into using a vector type with fewer elements but
19180 // wider element types. We cap this to not form integers or floating point
19181 // elements wider than 64 bits. It does not seem beneficial to form i128
19182 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
19183 SmallVector<int, 16> WidenedMask;
19184 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
19185 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
19186 // Shuffle mask widening should not interfere with a broadcast opportunity
19187 // by obfuscating the operands with bitcasts.
19188 // TODO: Avoid lowering directly from this top-level function: make this
19189 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
19190 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
19191 Subtarget, DAG))
19192 return Broadcast;
19193
19194 MVT NewEltVT = VT.isFloatingPoint()
19195 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
19196 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
19197 int NewNumElts = NumElements / 2;
19198 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
19199 // Make sure that the new vector type is legal. For example, v2f64 isn't
19200 // legal on SSE1.
19201 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
19202 if (V2IsZero) {
19203 // Modify the new Mask to take all zeros from the all-zero vector.
19204 // Choose indices that are blend-friendly.
19205 bool UsedZeroVector = false;
19206 assert(is_contained(WidenedMask, SM_SentinelZero) &&
19207 "V2's non-undef elements are used?!");
19208 for (int i = 0; i != NewNumElts; ++i)
19209 if (WidenedMask[i] == SM_SentinelZero) {
19210 WidenedMask[i] = i + NewNumElts;
19211 UsedZeroVector = true;
19212 }
19213 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
19214 // some elements to be undef.
19215 if (UsedZeroVector)
19216 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
19217 }
19218 V1 = DAG.getBitcast(NewVT, V1);
19219 V2 = DAG.getBitcast(NewVT, V2);
19220 return DAG.getBitcast(
19221 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
19222 }
19223 }
19224
19225 SmallVector<SDValue> Ops = {V1, V2};
19226 SmallVector<int> Mask(OrigMask.begin(), OrigMask.end());
19227
19228 // Canonicalize the shuffle with any horizontal ops inputs.
19229 // NOTE: This may update Ops and Mask.
19230 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
19231 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19232 return DAG.getBitcast(VT, HOp);
19233
19234 V1 = DAG.getBitcast(VT, Ops[0]);
19235 V2 = DAG.getBitcast(VT, Ops[1]);
19236 assert(NumElements == (int)Mask.size() &&
19237 "canonicalizeShuffleMaskWithHorizOp "
19238 "shouldn't alter the shuffle mask size");
19239
19240 // Commute the shuffle if it will improve canonicalization.
19241 if (canonicalizeShuffleMaskWithCommute(Mask)) {
19242 ShuffleVectorSDNode::commuteMask(Mask);
19243 std::swap(V1, V2);
19244 }
19245
19246 // For each vector width, delegate to a specialized lowering routine.
19247 if (VT.is128BitVector())
19248 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19249
19250 if (VT.is256BitVector())
19251 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19252
19253 if (VT.is512BitVector())
19254 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19255
19256 if (Is1BitVector)
19257 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19258
19259 llvm_unreachable("Unimplemented!");
19260}
19261
19262/// Try to lower a VSELECT instruction to a vector shuffle.
19263static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19264 const X86Subtarget &Subtarget,
19265 SelectionDAG &DAG) {
19266 SDValue Cond = Op.getOperand(0);
19267 SDValue LHS = Op.getOperand(1);
19268 SDValue RHS = Op.getOperand(2);
19269 MVT VT = Op.getSimpleValueType();
19270
19271 // Only non-legal VSELECTs reach this lowering, convert those into generic
19272 // shuffles and re-use the shuffle lowering path for blends.
19273 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19274 SmallVector<int, 32> Mask;
19275 if (createShuffleMaskFromVSELECT(Mask, Cond))
19276 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19277 }
19278
19279 return SDValue();
19280}
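
For the constant-condition path above, the blend mask is conceptually simple: a true lane keeps its LHS index i, a false lane takes i + NumElts from RHS. A hedged sketch of that mapping (maskFromCondition is illustrative and is not createShuffleMaskFromVSELECT, which also has to cope with undef and non-boolean constants):

  #include <cstdio>
  #include <vector>

  static std::vector<int> maskFromCondition(const std::vector<bool> &Cond) {
    int NumElts = (int)Cond.size();
    std::vector<int> Mask(NumElts);
    for (int i = 0; i != NumElts; ++i)
      Mask[i] = Cond[i] ? i : i + NumElts;   // LHS lane vs. RHS lane.
    return Mask;
  }

  int main() {
    for (int M : maskFromCondition({true, false, false, true}))
      std::printf("%d ", M);                 // Prints: 0 5 6 3
    std::printf("\n");
  }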
19281
19282SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19283 SDValue Cond = Op.getOperand(0);
19284 SDValue LHS = Op.getOperand(1);
19285 SDValue RHS = Op.getOperand(2);
19286
19287 // A vselect where all conditions and data are constants can be optimized into
19288 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19289 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19290 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19291 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19292 return SDValue();
19293
19294 // Try to lower this to a blend-style vector shuffle. This can handle all
19295 // constant condition cases.
19296 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19297 return BlendOp;
19298
19299 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19300 // with patterns on the mask registers on AVX-512.
19301 MVT CondVT = Cond.getSimpleValueType();
19302 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19303 if (CondEltSize == 1)
19304 return Op;
19305
19306 // Variable blends are only legal from SSE4.1 onward.
19307 if (!Subtarget.hasSSE41())
19308 return SDValue();
19309
19310 SDLoc dl(Op);
19311 MVT VT = Op.getSimpleValueType();
19312 unsigned EltSize = VT.getScalarSizeInBits();
19313 unsigned NumElts = VT.getVectorNumElements();
19314
19315 // Expand v32i16/v64i8 without BWI.
19316 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19317 return SDValue();
19318
19319 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19320 // into an i1 condition so that we can use the mask-based 512-bit blend
19321 // instructions.
19322 if (VT.getSizeInBits() == 512) {
19323 // Build a mask by testing the condition against zero.
19324 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19325 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19326 DAG.getConstant(0, dl, CondVT),
19327 ISD::SETNE);
19328 // Now return a new VSELECT using the mask.
19329 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19330 }
19331
19332 // SEXT/TRUNC cases where the mask doesn't match the destination size.
19333 if (CondEltSize != EltSize) {
19334 // If we don't have a sign splat, rely on the expansion.
19335 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19336 return SDValue();
19337
19338 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19339 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19340 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19341 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19342 }
19343
19344 // Only some types will be legal on some subtargets. If we can emit a legal
19345 // VSELECT-matching blend, return Op, but if we need to expand, return
19346 // a null value.
19347 switch (VT.SimpleTy) {
19348 default:
19349 // Most of the vector types have blends past SSE4.1.
19350 return Op;
19351
19352 case MVT::v32i8:
19353 // The byte blends for AVX vectors were introduced only in AVX2.
19354 if (Subtarget.hasAVX2())
19355 return Op;
19356
19357 return SDValue();
19358
19359 case MVT::v8i16:
19360 case MVT::v16i16: {
19361 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
19362 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19363 Cond = DAG.getBitcast(CastVT, Cond);
19364 LHS = DAG.getBitcast(CastVT, LHS);
19365 RHS = DAG.getBitcast(CastVT, RHS);
19366 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19367 return DAG.getBitcast(VT, Select);
19368 }
19369 }
19370}
19371
19372static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
19373 MVT VT = Op.getSimpleValueType();
19374 SDValue Vec = Op.getOperand(0);
19375 SDValue Idx = Op.getOperand(1);
19376 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
19377 SDLoc dl(Op);
19378
19379 if (!Vec.getSimpleValueType().is128BitVector())
19380 return SDValue();
19381
19382 if (VT.getSizeInBits() == 8) {
19383 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
19384 // we're going to zero extend the register or fold the store.
19385 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
19386 !X86::mayFoldIntoStore(Op))
19387 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19388 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19389 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19390
19391 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
19392 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19393 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19394 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19395 }
19396
19397 if (VT == MVT::f32) {
19398 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19399 // the result back to FR32 register. It's only worth matching if the
19400 // result has a single use which is a store or a bitcast to i32. And in
19401 // the case of a store, it's not worth it if the index is a constant 0,
19402 // because a MOVSSmr can be used instead, which is smaller and faster.
19403 if (!Op.hasOneUse())
19404 return SDValue();
19405 SDNode *User = *Op.getNode()->use_begin();
19406 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19407 (User->getOpcode() != ISD::BITCAST ||
19408 User->getValueType(0) != MVT::i32))
19409 return SDValue();
19410 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19411 DAG.getBitcast(MVT::v4i32, Vec), Idx);
19412 return DAG.getBitcast(MVT::f32, Extract);
19413 }
19414
19415 if (VT == MVT::i32 || VT == MVT::i64)
19416 return Op;
19417
19418 return SDValue();
19419}
19420
19421/// Extract one bit from mask vector, like v16i1 or v8i1.
19422/// AVX-512 feature.
19423static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
19424 const X86Subtarget &Subtarget) {
19425 SDValue Vec = Op.getOperand(0);
19426 SDLoc dl(Vec);
19427 MVT VecVT = Vec.getSimpleValueType();
19428 SDValue Idx = Op.getOperand(1);
19429 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19430 MVT EltVT = Op.getSimpleValueType();
19431
19432 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19433 "Unexpected vector type in ExtractBitFromMaskVector");
19434
19435 // A variable index can't be handled in mask registers,
19436 // so extend the vector to VR512/128.
19437 if (!IdxC) {
19438 unsigned NumElts = VecVT.getVectorNumElements();
19439 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
19440 // than extending to 128/256-bit.
19441 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19442 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19443 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19444 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19445 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19446 }
19447
19448 unsigned IdxVal = IdxC->getZExtValue();
19449 if (IdxVal == 0) // the operation is legal
19450 return Op;
19451
19452 // Extend to natively supported kshift.
19453 unsigned NumElems = VecVT.getVectorNumElements();
19454 MVT WideVecVT = VecVT;
19455 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19456 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19457 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19458 DAG.getUNDEF(WideVecVT), Vec,
19459 DAG.getIntPtrConstant(0, dl));
19460 }
19461
19462 // Use kshiftr instruction to move to the lower element.
19463 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19464 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19465
19466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19467 DAG.getIntPtrConstant(0, dl));
19468}
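
At the bit level, the KSHIFTR sequence in ExtractBitFromMaskVector amounts to shifting the k-register right by the index and reading bit 0. A scalar model, assuming a v16i1 mask held in the low bits of a uint16_t (purely illustrative):

  #include <cstdint>
  #include <cstdio>

  static unsigned extractMaskBit(uint16_t KReg, unsigned Idx) {
    return (KReg >> Idx) & 1u;        // kshiftr + extract element 0.
  }

  int main() {
    uint16_t K = 0b0000000000100100;  // Bits 2 and 5 set.
    std::printf("%u %u %u\n", extractMaskBit(K, 2), extractMaskBit(K, 3),
                extractMaskBit(K, 5));  // Prints: 1 0 1
  }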
19469
19470SDValue
19471X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19472 SelectionDAG &DAG) const {
19473 SDLoc dl(Op);
19474 SDValue Vec = Op.getOperand(0);
19475 MVT VecVT = Vec.getSimpleValueType();
19476 SDValue Idx = Op.getOperand(1);
19477 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19478
19479 if (VecVT.getVectorElementType() == MVT::i1)
19480 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19481
19482 if (!IdxC) {
19483 // It's more profitable to go through memory (1 cycle throughput)
19484 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
19485 // The IACA tool was used to get the performance estimate
19486 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19487 //
19488 // example : extractelement <16 x i8> %a, i32 %i
19489 //
19490 // Block Throughput: 3.00 Cycles
19491 // Throughput Bottleneck: Port5
19492 //
19493 // | Num Of | Ports pressure in cycles | |
19494 // | Uops | 0 - DV | 5 | 6 | 7 | |
19495 // ---------------------------------------------
19496 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19497 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19498 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19499 // Total Num Of Uops: 4
19500 //
19501 //
19502 // Block Throughput: 1.00 Cycles
19503 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19504 //
19505 // | | Ports pressure in cycles | |
19506 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19507 // ---------------------------------------------------------
19508 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19509 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19510 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19511 // Total Num Of Uops: 4
19512
19513 return SDValue();
19514 }
19515
19516 unsigned IdxVal = IdxC->getZExtValue();
19517
19518 // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
19519 // vector and then extract the element from that 128-bit vector.
19520 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19521 // Get the 128-bit vector.
19522 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19523 MVT EltVT = VecVT.getVectorElementType();
19524
19525 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19526 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19527
19528 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19529 // this can be done with a mask.
19530 IdxVal &= ElemsPerChunk - 1;
19531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19532 DAG.getIntPtrConstant(IdxVal, dl));
19533 }
19534
19535 assert(VecVT.is128BitVector() && "Unexpected vector length");
19536
19537 MVT VT = Op.getSimpleValueType();
19538
19539 if (VT == MVT::i16) {
19540 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19541 // we're going to zero extend the register or fold the store (SSE41 only).
19542 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19543 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19544 if (Subtarget.hasFP16())
19545 return Op;
19546
19547 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19548 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19549 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19550 }
19551
19552 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19553 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19554 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19555 }
19556
19557 if (Subtarget.hasSSE41())
19558 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19559 return Res;
19560
19561 // TODO: We only extract a single element from v16i8, so we can probably afford
19562 // to be more aggressive here before using the default approach of spilling to
19563 // stack.
19564 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
19565 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19566 int DWordIdx = IdxVal / 4;
19567 if (DWordIdx == 0) {
19568 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19569 DAG.getBitcast(MVT::v4i32, Vec),
19570 DAG.getIntPtrConstant(DWordIdx, dl));
19571 int ShiftVal = (IdxVal % 4) * 8;
19572 if (ShiftVal != 0)
19573 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19574 DAG.getConstant(ShiftVal, dl, MVT::i8));
19575 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19576 }
19577
19578 int WordIdx = IdxVal / 2;
19579 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19580 DAG.getBitcast(MVT::v8i16, Vec),
19581 DAG.getIntPtrConstant(WordIdx, dl));
19582 int ShiftVal = (IdxVal % 2) * 8;
19583 if (ShiftVal != 0)
19584 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19585 DAG.getConstant(ShiftVal, dl, MVT::i8));
19586 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19587 }
19588
19589 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19590 if (IdxVal == 0)
19591 return Op;
19592
19593 // Shuffle the element to the lowest element, then movss or movsh.
19594 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19595 Mask[0] = static_cast<int>(IdxVal);
19596 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19597 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19598 DAG.getIntPtrConstant(0, dl));
19599 }
19600
19601 if (VT.getSizeInBits() == 64) {
19602 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19603 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19604 // to match extract_elt for f64.
19605 if (IdxVal == 0)
19606 return Op;
19607
19608 // UNPCKHPD the element to the lowest double word, then movsd.
19609 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19610 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19611 int Mask[2] = { 1, -1 };
19612 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19613 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19614 DAG.getIntPtrConstant(0, dl));
19615 }
19616
19617 return SDValue();
19618}
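
The sub-byte path in LowerEXTRACT_VECTOR_ELT relies on simple index arithmetic: the containing 32-bit (or 16-bit) lane is IdxVal / 4 (or / 2) and the byte's offset inside it is (IdxVal % 4) * 8. A scalar model of the dword case (the DAG code switches to a word extract for higher lanes, but the arithmetic is the same idea; the v16i8 vector is modelled here as four uint32_t lanes):

  #include <cstdint>
  #include <cstdio>

  static uint8_t extractByte(const uint32_t Lanes[4], unsigned IdxVal) {
    unsigned DWordIdx = IdxVal / 4;            // Which 32-bit lane holds the byte.
    unsigned ShiftVal = (IdxVal % 4) * 8;      // Bit offset of the byte within it.
    return (uint8_t)(Lanes[DWordIdx] >> ShiftVal);
  }

  int main() {
    // Little-endian lanes holding bytes 0x00..0x0f.
    uint32_t Lanes[4] = {0x03020100u, 0x07060504u, 0x0b0a0908u, 0x0f0e0d0cu};
    std::printf("0x%02x\n", extractByte(Lanes, 6));   // Prints 0x06.
  }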
19619
19620/// Insert one bit to mask vector, like v16i1 or v8i1.
19621/// AVX-512 feature.
19622static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
19623 const X86Subtarget &Subtarget) {
19624 SDLoc dl(Op);
19625 SDValue Vec = Op.getOperand(0);
19626 SDValue Elt = Op.getOperand(1);
19627 SDValue Idx = Op.getOperand(2);
19628 MVT VecVT = Vec.getSimpleValueType();
19629
19630 if (!isa<ConstantSDNode>(Idx)) {
19631 // Non-constant index. Extend source and destination,
19632 // insert the element, and then truncate the result.
19633 unsigned NumElts = VecVT.getVectorNumElements();
19634 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19635 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19636 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19637 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19638 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19639 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19640 }
19641
19642 // Copy into a k-register, extract to v1i1 and insert_subvector.
19643 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19644 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19645}
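
For a constant index, inserting one element of a vXi1 vector is, in k-register terms, just replacing a single bit. A scalar model of that effect, assuming a v16i1 mask in a uint16_t (the lowering itself goes through SCALAR_TO_VECTOR and INSERT_SUBVECTOR rather than explicit bit twiddling):

  #include <cstdint>
  #include <cstdio>

  static uint16_t insertMaskBit(uint16_t KReg, unsigned Idx, bool Bit) {
    uint16_t Cleared = KReg & (uint16_t)~(1u << Idx);    // Clear the target bit.
    return Cleared | (uint16_t)((Bit ? 1u : 0u) << Idx); // Then set it if needed.
  }

  int main() {
    uint16_t K = 0b0000000000000101;
    std::printf("0x%04x\n", insertMaskBit(K, 3, true));  // Prints 0x000d.
  }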
19646
19647SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19648 SelectionDAG &DAG) const {
19649 MVT VT = Op.getSimpleValueType();
19650 MVT EltVT = VT.getVectorElementType();
19651 unsigned NumElts = VT.getVectorNumElements();
19652 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19653
19654 if (EltVT == MVT::i1)
19655 return InsertBitToMaskVector(Op, DAG, Subtarget);
19656
19657 SDLoc dl(Op);
19658 SDValue N0 = Op.getOperand(0);
19659 SDValue N1 = Op.getOperand(1);
19660 SDValue N2 = Op.getOperand(2);
19661 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19662
19663 if (!N2C) {
19664 // For variable insertion indices we're usually better off spilling to stack,
19665 // but AVX512 can use a variable compare+select by comparing against all
19666 // possible vector indices, and FP insertion has less gpr->simd traffic.
19667 if (!(Subtarget.hasBWI() ||
19668 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19669 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
19670 return SDValue();
19671
19672 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19673 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19674 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19675 return SDValue();
19676
19677 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19678 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19679 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19680
19681 SmallVector<SDValue, 16> RawIndices;
19682 for (unsigned I = 0; I != NumElts; ++I)
19683 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19684 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19685
19686 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19687 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19688 ISD::CondCode::SETEQ);
19689 }
19690
19691 if (N2C->getAPIntValue().uge(NumElts))
19692 return SDValue();
19693 uint64_t IdxVal = N2C->getZExtValue();
19694
19695 bool IsZeroElt = X86::isZeroNode(N1);
19696 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19697
19698 if (IsZeroElt || IsAllOnesElt) {
19699 // Lower insertion of i8 -1 as an 'OR' blend.
19700 // We don't deal with i8 0 since it appears to be handled elsewhere.
19701 if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
19702 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
19703 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19704 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19705 CstVectorElts[IdxVal] = OnesCst;
19706 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19707 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19708 }
19709 // See if we can do this more efficiently with a blend shuffle with a
19710 // rematerializable vector.
19711 if (Subtarget.hasSSE41() &&
19712 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19713 SmallVector<int, 8> BlendMask;
19714 for (unsigned i = 0; i != NumElts; ++i)
19715 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19716 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19717 : getOnesVector(VT, DAG, dl);
19718 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19719 }
19720 }
19721
19722 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19723 // into that, and then insert the subvector back into the result.
19724 if (VT.is256BitVector() || VT.is512BitVector()) {
19725 // With a 256-bit vector, we can insert into the zero element efficiently
19726 // using a blend if we have AVX or AVX2 and the right data type.
19727 if (VT.is256BitVector() && IdxVal == 0) {
19728 // TODO: It is worthwhile to cast integer to floating point and back
19729 // and incur a domain crossing penalty if that's what we'll end up
19730 // doing anyway after extracting to a 128-bit vector.
19731 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19732 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19733 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19734 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19735 DAG.getTargetConstant(1, dl, MVT::i8));
19736 }
19737 }
19738
19739 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19740 assert(isPowerOf2_32(NumEltsIn128) &&
19741 "Vectors will always have power-of-two number of elements.");
19742
19743 // If we are not inserting into the low 128-bit vector chunk,
19744 // then prefer the broadcast+blend sequence.
19745 // FIXME: relax the profitability check iff all N1 uses are insertions.
19746 if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
19747 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19748 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19749 X86::mayFoldLoad(N1, Subtarget)))) {
19750 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19751 SmallVector<int, 8> BlendMask;
19752 for (unsigned i = 0; i != NumElts; ++i)
19753 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19754 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19755 }
19756
19757 // Get the desired 128-bit vector chunk.
19758 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19759
19760 // Insert the element into the desired chunk.
19761 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19762 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19763
19764 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19765 DAG.getIntPtrConstant(IdxIn128, dl));
19766
19767 // Insert the changed part back into the bigger vector
19768 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19769 }
19770 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19771
19772 // This will be just movw/movd/movq/movsh/movss/movsd.
19773 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19774 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19775 EltVT == MVT::f16 || EltVT == MVT::i64) {
19776 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19777 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19778 }
19779
19780 // We can't directly insert an i8 or i16 into a vector, so zero extend
19781 // it to i32 first.
19782 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19783 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19784 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19786 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19787 return DAG.getBitcast(VT, N1);
19788 }
19789 }
19790
19791 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19792 // argument. SSE41 required for pinsrb.
19793 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19794 unsigned Opc;
19795 if (VT == MVT::v8i16) {
19796 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19797 Opc = X86ISD::PINSRW;
19798 } else {
19799 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19800 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19801 Opc = X86ISD::PINSRB;
19802 }
19803
19804 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19805 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19806 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19807 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19808 }
19809
19810 if (Subtarget.hasSSE41()) {
19811 if (EltVT == MVT::f32) {
19812 // Bits [7:6] of the constant are the source select. This will always be
19813 // zero here. The DAG Combiner may combine an extract_elt index into
19814 // these bits. For example (insert (extract, 3), 2) could be matched by
19815 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19816 // Bits [5:4] of the constant are the destination select. This is the
19817 // value of the incoming immediate.
19818 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19819 // combine either bitwise AND or insert of float 0.0 to set these bits.
19820
19821 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19822 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19823 // If this is an insertion of 32-bits into the low 32-bits of
19824 // a vector, we prefer to generate a blend with immediate rather
19825 // than an insertps. Blends are simpler operations in hardware and so
19826 // will always have equal or better performance than insertps.
19827 // But if optimizing for size and there's a load folding opportunity,
19828 // generate insertps because blendps does not have a 32-bit memory
19829 // operand form.
19830 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19831 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19832 DAG.getTargetConstant(1, dl, MVT::i8));
19833 }
19834 // Create this as a scalar to vector.
19835 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19836 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19837 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19838 }
19839
19840 // PINSR* works with constant index.
19841 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19842 return Op;
19843 }
19844
19845 return SDValue();
19846}
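
The wide-vector branch of LowerINSERT_VECTOR_ELT reduces to two index computations: which 128-bit chunk holds the element (IdxVal / NumEltsIn128) and where it sits inside that chunk (IdxVal & (NumEltsIn128 - 1)). A tiny worked example with assumed values (a v16i32 insert at index 11):

  #include <cstdio>

  int main() {
    unsigned EltSizeInBits = 32;
    unsigned NumEltsIn128 = 128 / EltSizeInBits;           // 4 elements per chunk.
    unsigned IdxVal = 11;                                   // Insert position in v16i32.
    unsigned Chunk = IdxVal / NumEltsIn128;                 // 128-bit chunk to extract.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);        // Index inside that chunk.
    std::printf("chunk=%u idx=%u\n", Chunk, IdxIn128);      // Prints: chunk=2 idx=3
  }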
19847
19848static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19849 SelectionDAG &DAG) {
19850 SDLoc dl(Op);
19851 MVT OpVT = Op.getSimpleValueType();
19852
19853 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19854 // further combines.
19855 if (X86::isZeroNode(Op.getOperand(0)))
19856 return getZeroVector(OpVT, Subtarget, DAG, dl);
19857
19858 // If this is a 256-bit (or wider) vector result, first insert into a 128-bit
19859 // vector and then insert into the wider vector.
19860 if (!OpVT.is128BitVector()) {
19861 // Insert into a 128-bit vector.
19862 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19863 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19864 OpVT.getVectorNumElements() / SizeFactor);
19865
19866 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19867
19868 // Insert the 128-bit vector.
19869 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19870 }
19871 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19872 "Expected an SSE type!");
19873
19874 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19875 // tblgen.
19876 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19877 return Op;
19878
19879 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19880 return DAG.getBitcast(
19881 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19882}
19883
19884// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19885// simple superregister reference or explicit instructions to insert
19886// the upper bits of a vector.
19887static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19888 SelectionDAG &DAG) {
19889 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19890
19891 return insert1BitVector(Op, DAG, Subtarget);
19892}
19893
19894static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19895 SelectionDAG &DAG) {
19896 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19897 "Only vXi1 extract_subvectors need custom lowering");
19898
19899 SDLoc dl(Op);
19900 SDValue Vec = Op.getOperand(0);
19901 uint64_t IdxVal = Op.getConstantOperandVal(1);
19902
19903 if (IdxVal == 0) // the operation is legal
19904 return Op;
19905
19906 MVT VecVT = Vec.getSimpleValueType();
19907 unsigned NumElems = VecVT.getVectorNumElements();
19908
19909 // Extend to natively supported kshift.
19910 MVT WideVecVT = VecVT;
19911 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19912 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19913 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19914 DAG.getUNDEF(WideVecVT), Vec,
19915 DAG.getIntPtrConstant(0, dl));
19916 }
19917
19918 // Shift to the LSB.
19919 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19920 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19921
19922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19923 DAG.getIntPtrConstant(0, dl));
19924}
19925
19926// Returns the appropriate wrapper opcode for a global reference.
19927unsigned X86TargetLowering::getGlobalWrapperKind(
19928 const GlobalValue *GV, const unsigned char OpFlags) const {
19929 // References to absolute symbols are never PC-relative.
19930 if (GV && GV->isAbsoluteSymbolRef())
19931 return X86ISD::Wrapper;
19932
19933 CodeModel::Model M = getTargetMachine().getCodeModel();
19934 if (Subtarget.isPICStyleRIPRel() &&
19935 (M == CodeModel::Small || M == CodeModel::Kernel))
19936 return X86ISD::WrapperRIP;
19937
19938 // GOTPCREL references must always use RIP.
19939 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19940 return X86ISD::WrapperRIP;
19941
19942 return X86ISD::Wrapper;
19943}
19944
19945 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19946 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19947 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19948 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19949 // be used to form an addressing mode. These wrapped nodes will be selected
19950 // into MOV32ri.
19951SDValue
19952X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19953 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19954
19955 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19956 // global base reg.
19957 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19958
19959 auto PtrVT = getPointerTy(DAG.getDataLayout());
19960 SDValue Result = DAG.getTargetConstantPool(
19961 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19962 SDLoc DL(CP);
19963 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19964 // With PIC, the address is actually $g + Offset.
19965 if (OpFlag) {
19966 Result =
19967 DAG.getNode(ISD::ADD, DL, PtrVT,
19968 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19969 }
19970
19971 return Result;
19972}
19973
19974SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19975 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19976
19977 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19978 // global base reg.
19979 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19980
19981 auto PtrVT = getPointerTy(DAG.getDataLayout());
19982 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19983 SDLoc DL(JT);
19984 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19985
19986 // With PIC, the address is actually $g + Offset.
19987 if (OpFlag)
19988 Result =
19989 DAG.getNode(ISD::ADD, DL, PtrVT,
19990 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19991
19992 return Result;
19993}
19994
19995SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19996 SelectionDAG &DAG) const {
19997 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19998}
19999
20000SDValue
20001X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20002 // Create the TargetBlockAddress node.
20003 unsigned char OpFlags =
20004 Subtarget.classifyBlockAddressReference();
20005 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20006 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20007 SDLoc dl(Op);
20008 auto PtrVT = getPointerTy(DAG.getDataLayout());
20009 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20010 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20011
20012 // With PIC, the address is actually $g + Offset.
20013 if (isGlobalRelativeToPICBase(OpFlags)) {
20014 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20015 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20016 }
20017
20018 return Result;
20019}
20020
20021/// Creates target global address or external symbol nodes for calls or
20022/// other uses.
20023SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20024 bool ForCall) const {
20025 // Unpack the global address or external symbol.
20026 const SDLoc &dl = SDLoc(Op);
20027 const GlobalValue *GV = nullptr;
20028 int64_t Offset = 0;
20029 const char *ExternalSym = nullptr;
20030 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20031 GV = G->getGlobal();
20032 Offset = G->getOffset();
20033 } else {
20034 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20035 ExternalSym = ES->getSymbol();
20036 }
20037
20038 // Calculate some flags for address lowering.
20039 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20040 unsigned char OpFlags;
20041 if (ForCall)
20042 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20043 else
20044 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20045 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20046 bool NeedsLoad = isGlobalStubReference(OpFlags);
20047
20048 CodeModel::Model M = DAG.getTarget().getCodeModel();
20049 auto PtrVT = getPointerTy(DAG.getDataLayout());
20050 SDValue Result;
20051
20052 if (GV) {
20053 // Create a target global address if this is a global. If possible, fold the
20054 // offset into the global address reference. Otherwise, ADD it on later.
20055 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20056 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20057 // relocation will compute to a negative value, which is invalid.
20058 int64_t GlobalOffset = 0;
20059 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20060 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20061 std::swap(GlobalOffset, Offset);
20062 }
20063 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20064 } else {
20065 // If this is not a global address, this must be an external symbol.
20066 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20067 }
20068
20069 // If this is a direct call, avoid the wrapper if we don't need to do any
20070 // loads or adds. This allows SDAG ISel to match direct calls.
20071 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20072 return Result;
20073
20074 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20075
20076 // With PIC, the address is actually $g + Offset.
20077 if (HasPICReg) {
20078 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20079 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20080 }
20081
20082 // For globals that require a load from a stub to get the address, emit the
20083 // load.
20084 if (NeedsLoad)
20085 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20086 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20087
20088 // If there was a non-zero offset that we didn't fold, create an explicit
20089 // addition for it.
20090 if (Offset != 0)
20091 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20092 DAG.getConstant(Offset, dl, PtrVT));
20093
20094 return Result;
20095}
20096
20097SDValue
20098X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20099 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20100}
20101
20102static SDValue
20103GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20104 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
20105 unsigned char OperandFlags, bool LocalDynamic = false) {
20106 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20107 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20108 SDLoc dl(GA);
20109 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20110 GA->getValueType(0),
20111 GA->getOffset(),
20112 OperandFlags);
20113
20114 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20115 : X86ISD::TLSADDR;
20116
20117 if (InFlag) {
20118 SDValue Ops[] = { Chain, TGA, *InFlag };
20119 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20120 } else {
20121 SDValue Ops[] = { Chain, TGA };
20122 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20123 }
20124
20125 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
20126 MFI.setAdjustsStack(true);
20127 MFI.setHasCalls(true);
20128
20129 SDValue Flag = Chain.getValue(1);
20130 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
20131}
20132
20133// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20134static SDValue
20135LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20136 const EVT PtrVT) {
20137 SDValue InFlag;
20138 SDLoc dl(GA); // ? function entry point might be better
20139 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20140 DAG.getNode(X86ISD::GlobalBaseReg,
20141 SDLoc(), PtrVT), InFlag);
20142 InFlag = Chain.getValue(1);
20143
20144 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
20145}
20146
20147// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
20148static SDValue
20149LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20150 const EVT PtrVT) {
20151 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20152 X86::RAX, X86II::MO_TLSGD);
20153}
20154
20155// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20156static SDValue
20157LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20158 const EVT PtrVT) {
20159 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20160 X86::EAX, X86II::MO_TLSGD);
20161}
20162
20163static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
20164 SelectionDAG &DAG, const EVT PtrVT,
20165 bool Is64Bit, bool Is64BitLP64) {
20166 SDLoc dl(GA);
20167
20168 // Get the start address of the TLS block for this module.
20169 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
20170 .getInfo<X86MachineFunctionInfo>();
20171 MFI->incNumLocalDynamicTLSAccesses();
20172
20173 SDValue Base;
20174 if (Is64Bit) {
20175 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20176 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
20177 X86II::MO_TLSLD, /*LocalDynamic=*/true);
20178 } else {
20179 SDValue InFlag;
20180 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20181 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
20182 InFlag = Chain.getValue(1);
20183 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
20184 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
20185 }
20186
20187 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20188 // of Base.
20189
20190 // Build x@dtpoff.
20191 unsigned char OperandFlags = X86II::MO_DTPOFF;
20192 unsigned WrapperKind = X86ISD::Wrapper;
20193 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20194 GA->getValueType(0),
20195 GA->getOffset(), OperandFlags);
20196 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20197
20198 // Add x@dtpoff with the base.
20199 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20200}
20201
20202// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20203static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20204 const EVT PtrVT, TLSModel::Model model,
20205 bool is64Bit, bool isPIC) {
20206 SDLoc dl(GA);
20207
20208 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20209 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
20210 is64Bit ? 257 : 256));
20211
20212 SDValue ThreadPointer =
20213 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20214 MachinePointerInfo(Ptr));
20215
20216 unsigned char OperandFlags = 0;
20217 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20218 // initialexec.
20219 unsigned WrapperKind = X86ISD::Wrapper;
20220 if (model == TLSModel::LocalExec) {
20221 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20222 } else if (model == TLSModel::InitialExec) {
20223 if (is64Bit) {
20224 OperandFlags = X86II::MO_GOTTPOFF;
20225 WrapperKind = X86ISD::WrapperRIP;
20226 } else {
20227 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20228 }
20229 } else {
20230 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20230)
;
20231 }
20232
20233 // emit "addl x@ntpoff,%eax" (local exec)
20234 // or "addl x@indntpoff,%eax" (initial exec)
20235 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
20236 SDValue TGA =
20237 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20238 GA->getOffset(), OperandFlags);
20239 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20240
20241 if (model == TLSModel::InitialExec) {
20242 if (isPIC && !is64Bit) {
20243 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20244 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20245 Offset);
20246 }
20247
20248 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20249 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20250 }
20251
20252 // The address of the thread local variable is the add of the thread
20253 // pointer with the offset of the variable.
20254 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20255}
20256
20257SDValue
20258X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20259
20260 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20261
20262 if (DAG.getTarget().useEmulatedTLS())
20263 return LowerToTLSEmulatedModel(GA, DAG);
20264
20265 const GlobalValue *GV = GA->getGlobal();
20266 auto PtrVT = getPointerTy(DAG.getDataLayout());
20267 bool PositionIndependent = isPositionIndependent();
20268
20269 if (Subtarget.isTargetELF()) {
20270 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20271 switch (model) {
20272 case TLSModel::GeneralDynamic:
20273 if (Subtarget.is64Bit()) {
20274 if (Subtarget.isTarget64BitLP64())
20275 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20276 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20277 }
20278 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20279 case TLSModel::LocalDynamic:
20280 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20281 Subtarget.isTarget64BitLP64());
20282 case TLSModel::InitialExec:
20283 case TLSModel::LocalExec:
20284 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20285 PositionIndependent);
20286 }
20287 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20287)
;
20288 }
20289
20290 if (Subtarget.isTargetDarwin()) {
20291 // Darwin only has one model of TLS. Lower to that.
20292 unsigned char OpFlag = 0;
20293 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
20294 X86ISD::WrapperRIP : X86ISD::Wrapper;
20295
20296 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20297 // global base reg.
20298 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20299 if (PIC32)
20300 OpFlag = X86II::MO_TLVP_PIC_BASE;
20301 else
20302 OpFlag = X86II::MO_TLVP;
20303 SDLoc DL(Op);
20304 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
20305 GA->getValueType(0),
20306 GA->getOffset(), OpFlag);
20307 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20308
20309 // With PIC32, the address is actually $g + Offset.
20310 if (PIC32)
20311 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20312 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20313 Offset);
20314
20315 // Lowering the machine isd will make sure everything is in the right
20316 // location.
20317 SDValue Chain = DAG.getEntryNode();
20318 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20319 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20320 SDValue Args[] = { Chain, Offset };
20321 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20322 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
20323 DAG.getIntPtrConstant(0, DL, true),
20324 Chain.getValue(1), DL);
20325
20326 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
20327 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20328 MFI.setAdjustsStack(true);
20329
20330 // And our return value (tls address) is in the standard call return value
20331 // location.
20332 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20333 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20334 }
20335
20336 if (Subtarget.isOSWindows()) {
20337 // Just use the implicit TLS architecture
20338 // Need to generate something similar to:
20339 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20340 // ; from TEB
20341 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
20342 // mov rcx, qword [rdx+rcx*8]
20343 // mov eax, .tls$:tlsvar
20344 // [rax+rcx] contains the address
20345 // Windows 64bit: gs:0x58
20346 // Windows 32bit: fs:__tls_array
20347
20348 SDLoc dl(GA);
20349 SDValue Chain = DAG.getEntryNode();
20350
20351 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20352 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20353 // use its literal value of 0x2C.
20354 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
20355 ? Type::getInt8PtrTy(*DAG.getContext(),
20356 256)
20357 : Type::getInt32PtrTy(*DAG.getContext(),
20358 257));
20359
20360 SDValue TlsArray = Subtarget.is64Bit()
20361 ? DAG.getIntPtrConstant(0x58, dl)
20362 : (Subtarget.isTargetWindowsGNU()
20363 ? DAG.getIntPtrConstant(0x2C, dl)
20364 : DAG.getExternalSymbol("_tls_array", PtrVT));
20365
20366 SDValue ThreadPointer =
20367 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20368
20369 SDValue res;
20370 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
20371 res = ThreadPointer;
20372 } else {
20373 // Load the _tls_index variable
20374 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
20375 if (Subtarget.is64Bit())
20376 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20377 MachinePointerInfo(), MVT::i32);
20378 else
20379 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20380
20381 const DataLayout &DL = DAG.getDataLayout();
20382 SDValue Scale =
20383 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20384 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20385
20386 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20387 }
20388
20389 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20390
20391 // Get the offset of start of .tls section
20392 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20393 GA->getValueType(0),
20394 GA->getOffset(), X86II::MO_SECREL);
20395 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20396
20397 // The address of the thread local variable is the add of the thread
20398 // pointer with the offset of the variable.
20399 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20400 }
20401
20402 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 20402)
;
20403}
20404
20405/// Lower SRA_PARTS and friends, which return two i32 values
20406/// and take a 2 x i32 value to shift plus a shift amount.
20407/// TODO: Can this be moved to general expansion code?
20408static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
20409 SDValue Lo, Hi;
20410 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20411 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20412}
20413
20414// Try to use a packed vector operation to handle i64 on 32-bit targets when
20415// AVX512DQ is enabled.
20416static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
20417 const X86Subtarget &Subtarget) {
20418 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20419 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20420 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20421 Op.getOpcode() == ISD::UINT_TO_FP) &&
20422 "Unexpected opcode!");
20423 bool IsStrict = Op->isStrictFPOpcode();
20424 unsigned OpNo = IsStrict ? 1 : 0;
20425 SDValue Src = Op.getOperand(OpNo);
20426 MVT SrcVT = Src.getSimpleValueType();
20427 MVT VT = Op.getSimpleValueType();
20428
20429 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20430 (VT != MVT::f32 && VT != MVT::f64))
20431 return SDValue();
20432
20433 // Pack the i64 into a vector, do the operation and extract.
20434
20435 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
20436 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20437 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20438 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20439
20440 SDLoc dl(Op);
20441 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20442 if (IsStrict) {
20443 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20444 {Op.getOperand(0), InVec});
20445 SDValue Chain = CvtVec.getValue(1);
20446 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20447 DAG.getIntPtrConstant(0, dl));
20448 return DAG.getMergeValues({Value, Chain}, dl);
20449 }
20450
20451 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20452
20453 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20454 DAG.getIntPtrConstant(0, dl));
20455}
20456
20457// Try to use a packed vector operation to handle i64 on 32-bit targets.
20458static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
20459 const X86Subtarget &Subtarget) {
20460 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20461 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20462 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20463 Op.getOpcode() == ISD::UINT_TO_FP) &&
20464 "Unexpected opcode!");
20465 bool IsStrict = Op->isStrictFPOpcode();
20466 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20467 MVT SrcVT = Src.getSimpleValueType();
20468 MVT VT = Op.getSimpleValueType();
20469
20470 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20471 return SDValue();
20472
20473 // Pack the i64 into a vector, do the operation and extract.
20474
20475 assert(Subtarget.hasFP16() && "Expected FP16");
20476
20477 SDLoc dl(Op);
20478 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20479 if (IsStrict) {
20480 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20481 {Op.getOperand(0), InVec});
20482 SDValue Chain = CvtVec.getValue(1);
20483 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20484 DAG.getIntPtrConstant(0, dl));
20485 return DAG.getMergeValues({Value, Chain}, dl);
20486 }
20487
20488 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20489
20490 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20491 DAG.getIntPtrConstant(0, dl));
20492}
20493
20494static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20495 const X86Subtarget &Subtarget) {
20496 switch (Opcode) {
20497 case ISD::SINT_TO_FP:
20498 // TODO: Handle wider types with AVX/AVX512.
20499 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20500 return false;
20501 // CVTDQ2PS or (V)CVTDQ2PD
20502 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20503
20504 case ISD::UINT_TO_FP:
20505 // TODO: Handle wider types and i64 elements.
20506 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20507 return false;
20508 // VCVTUDQ2PS or VCVTUDQ2PD
20509 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20510
20511 default:
20512 return false;
20513 }
20514}
20515
20516/// Given a scalar cast operation that is extracted from a vector, try to
20517/// vectorize the cast op followed by extraction. This will avoid an expensive
20518/// round-trip between XMM and GPR.
20519static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
20520 const X86Subtarget &Subtarget) {
20521 // TODO: This could be enhanced to handle smaller integer types by peeking
20522 // through an extend.
20523 SDValue Extract = Cast.getOperand(0);
20524 MVT DestVT = Cast.getSimpleValueType();
20525 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20526 !isa<ConstantSDNode>(Extract.getOperand(1)))
20527 return SDValue();
20528
20529 // See if we have a 128-bit vector cast op for this type of cast.
20530 SDValue VecOp = Extract.getOperand(0);
20531 MVT FromVT = VecOp.getSimpleValueType();
20532 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20533 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
20534 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20535 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20536 return SDValue();
20537
20538 // If we are extracting from a non-zero element, first shuffle the source
20539 // vector to allow extracting from element zero.
20540 SDLoc DL(Cast);
20541 if (!isNullConstant(Extract.getOperand(1))) {
20542 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20543 Mask[0] = Extract.getConstantOperandVal(1);
20544 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20545 }
20546 // If the source vector is wider than 128-bits, extract the low part. Do not
20547 // create an unnecessarily wide vector cast op.
20548 if (FromVT != Vec128VT)
20549 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20550
20551 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20552 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20553 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20555 DAG.getIntPtrConstant(0, DL));
20556}
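
For orientation, the scalar shape of the pattern this combine targets can be written out directly. The snippet below is a hypothetical illustration, not code from this file (it assumes SSE4.1 for _mm_extract_epi32): converting one extracted lane of an integer vector to FP. Lowered naively, the lane is moved to a GPR and converted there; the combine above instead casts the whole (possibly shuffled) 128-bit vector and extracts element 0 of the result.

// Hypothetical illustration only; assumes SSE4.1 is available.
#include <immintrin.h>

static float laneToFloat(__m128i V) {
  // extractelement + sint_to_fp -- the scalar round-trip the combine avoids.
  return static_cast<float>(_mm_extract_epi32(V, 2));
}
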
20557
20558/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20559/// try to vectorize the cast ops. This will avoid an expensive round-trip
20560/// between XMM and GPR.
20561static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
20562 const X86Subtarget &Subtarget) {
20563 // TODO: Allow FP_TO_UINT.
20564 SDValue CastToInt = CastToFP.getOperand(0);
20565 MVT VT = CastToFP.getSimpleValueType();
20566 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
20567 return SDValue();
20568
20569 MVT IntVT = CastToInt.getSimpleValueType();
20570 SDValue X = CastToInt.getOperand(0);
20571 MVT SrcVT = X.getSimpleValueType();
20572 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20573 return SDValue();
20574
20575 // See if we have 128-bit vector cast instructions for this type of cast.
20576 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20577 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20578 IntVT != MVT::i32)
20579 return SDValue();
20580
20581 unsigned SrcSize = SrcVT.getSizeInBits();
20582 unsigned IntSize = IntVT.getSizeInBits();
20583 unsigned VTSize = VT.getSizeInBits();
20584 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
20585 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
20586 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
20587
20588 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
20589 unsigned ToIntOpcode =
20590 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20591 unsigned ToFPOpcode =
20592 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20593
20594 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20595 //
20596 // We are not defining the high elements (for example, by zeroing them) because
20597 // that could nullify any performance advantage that we hoped to gain from
20598 // this vector op hack. We do not expect any adverse effects (like denorm
20599 // penalties) with cast ops.
20600 SDLoc DL(CastToFP);
20601 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
20602 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20603 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20604 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20605 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20606}
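
Likewise, the ftrunc-like scalar pattern handled here is simply a float/double truncated to i32 and converted straight back. The snippet below is a hypothetical illustration, not code from this file; a naive lowering of it round-trips through a GPR (cvttsd2si then cvtsi2sd), whereas the vector sequence built above stays in XMM registers (roughly cvttpd2dq followed by cvtdq2pd in the f64 case).

// Hypothetical illustration only.
#include <cstdint>

static double truncTowardZero(double X) {
  // fp_to_sint followed by sint_to_fp on a scalar value
  // (assumes X is within i32 range).
  return static_cast<double>(static_cast<int32_t>(X));
}
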
20607
20608static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
20609 const X86Subtarget &Subtarget) {
20610 SDLoc DL(Op);
20611 bool IsStrict = Op->isStrictFPOpcode();
20612 MVT VT = Op->getSimpleValueType(0);
20613 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20614
20615 if (Subtarget.hasDQI()) {
20616 assert(!Subtarget.hasVLX() && "Unexpected features");
20617
20618 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20619 Src.getSimpleValueType() == MVT::v4i64) &&
20620 "Unsupported custom type");
20621
20622 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20623 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20624 "Unexpected VT!");
20625 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20626
20627 // Need to concat with zero vector for strict fp to avoid spurious
20628 // exceptions.
20629 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20630 : DAG.getUNDEF(MVT::v8i64);
20631 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20632 DAG.getIntPtrConstant(0, DL));
20633 SDValue Res, Chain;
20634 if (IsStrict) {
20635 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20636 {Op->getOperand(0), Src});
20637 Chain = Res.getValue(1);
20638 } else {
20639 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20640 }
20641
20642 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20643 DAG.getIntPtrConstant(0, DL));
20644
20645 if (IsStrict)
20646 return DAG.getMergeValues({Res, Chain}, DL);
20647 return Res;
20648 }
20649
20650 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20651 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20652 if (VT != MVT::v4f32 || IsSigned)
20653 return SDValue();
20654
20655 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20656 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20657 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20658 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20659 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20660 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20661 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20662 SmallVector<SDValue, 4> SignCvts(4);
20663 SmallVector<SDValue, 4> Chains(4);
20664 for (int i = 0; i != 4; ++i) {
20665 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20666 DAG.getIntPtrConstant(i, DL));
20667 if (IsStrict) {
20668 SignCvts[i] =
20669 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20670 {Op.getOperand(0), Elt});
20671 Chains[i] = SignCvts[i].getValue(1);
20672 } else {
20673 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20674 }
20675 }
20676 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20677
20678 SDValue Slow, Chain;
20679 if (IsStrict) {
20680 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20681 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20682 {Chain, SignCvt, SignCvt});
20683 Chain = Slow.getValue(1);
20684 } else {
20685 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20686 }
20687
20688 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20689 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20690
20691 if (IsStrict)
20692 return DAG.getMergeValues({Cvt, Chain}, DL);
20693
20694 return Cvt;
20695}
20696
20697SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20698 SelectionDAG &DAG) const {
20699 bool IsStrict = Op->isStrictFPOpcode();
20700 unsigned OpNo = IsStrict ? 1 : 0;
20701 SDValue Src = Op.getOperand(OpNo);
20702 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20703 MVT SrcVT = Src.getSimpleValueType();
20704 MVT VT = Op.getSimpleValueType();
20705 SDLoc dl(Op);
20706
20707 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20708 return LowerWin64_INT128_TO_FP(Op, DAG);
20709
20710 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20711 return Extract;
20712
20713 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20714 return R;
20715
20716 if (SrcVT.isVector()) {
20717 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20718 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20719 // source for strict FP.
20720 if (IsStrict)
20721 return DAG.getNode(
20722 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20723 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20724 DAG.getUNDEF(SrcVT))});
20725 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20726 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20727 DAG.getUNDEF(SrcVT)));
20728 }
20729 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20730 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20731
20732 return SDValue();
20733 }
20734
20735 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20736 "Unknown SINT_TO_FP to lower!");
20737
20738 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20739
20740 // These are really Legal; return the operand so the caller accepts it as
20741 // Legal.
20742 if (SrcVT == MVT::i32 && UseSSEReg)
20743 return Op;
20744 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20745 return Op;
20746
20747 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20748 return V;
20749 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20750 return V;
20751
20752 // SSE doesn't have an i16 conversion so we need to promote.
20753 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20754 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20755 if (IsStrict)
20756 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20757 {Chain, Ext});
20758
20759 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20760 }
20761
20762 if (VT == MVT::f128)
20763 return SDValue();
20764
20765 SDValue ValueToStore = Src;
20766 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20767 // Bitcasting to f64 here allows us to do a single 64-bit store from
20768 // an SSE register, avoiding the store forwarding penalty that would come
20769 // with two 32-bit stores.
20770 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20771
20772 unsigned Size = SrcVT.getStoreSize();
20773 Align Alignment(Size);
20774 MachineFunction &MF = DAG.getMachineFunction();
20775 auto PtrVT = getPointerTy(MF.getDataLayout());
20776 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20777 MachinePointerInfo MPI =
20778 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20779 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20780 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20781 std::pair<SDValue, SDValue> Tmp =
20782 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20783
20784 if (IsStrict)
20785 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20786
20787 return Tmp.first;
20788}
20789
20790std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20791 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20792 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20793 // Build the FILD
20794 SDVTList Tys;
20795 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20796 if (useSSE)
20797 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20798 else
20799 Tys = DAG.getVTList(DstVT, MVT::Other);
20800
20801 SDValue FILDOps[] = {Chain, Pointer};
20802 SDValue Result =
20803 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20804 Alignment, MachineMemOperand::MOLoad);
20805 Chain = Result.getValue(1);
20806
20807 if (useSSE) {
20808 MachineFunction &MF = DAG.getMachineFunction();
20809 unsigned SSFISize = DstVT.getStoreSize();
20810 int SSFI =
20811 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20812 auto PtrVT = getPointerTy(MF.getDataLayout());
20813 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20814 Tys = DAG.getVTList(MVT::Other);
20815 SDValue FSTOps[] = {Chain, Result, StackSlot};
20816 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20817 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20818 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20819
20820 Chain =
20821 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20822 Result = DAG.getLoad(
20823 DstVT, DL, Chain, StackSlot,
20824 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20825 Chain = Result.getValue(1);
20826 }
20827
20828 return { Result, Chain };
20829}
20830
20831/// Horizontal vector math instructions may be slower than normal math with
20832/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20833/// implementation, and likely shuffle complexity of the alternate sequence.
20834static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20835 const X86Subtarget &Subtarget) {
20836 bool IsOptimizingSize = DAG.shouldOptForSize();
20837 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20838 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20839}
20840
20841/// 64-bit unsigned integer to double expansion.
20842static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20843 const X86Subtarget &Subtarget) {
20844 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20845 // when converting 0 while rounding toward negative infinity. The caller will
20846 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
20847 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20848 // This algorithm is not obvious. Here is what we're trying to output:
20849 /*
20850 movq %rax, %xmm0
20851 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20852 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20853 #ifdef __SSE3__
20854 haddpd %xmm0, %xmm0
20855 #else
20856 pshufd $0x4e, %xmm0, %xmm1
20857 addpd %xmm1, %xmm0
20858 #endif
20859 */
20860
20861 SDLoc dl(Op);
20862 LLVMContext *Context = DAG.getContext();
20863
20864 // Build some magic constants.
20865 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20866 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20867 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20868 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20869
20870 SmallVector<Constant*,2> CV1;
20871 CV1.push_back(
20872 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20873 APInt(64, 0x4330000000000000ULL))));
20874 CV1.push_back(
20875 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20876 APInt(64, 0x4530000000000000ULL))));
20877 Constant *C1 = ConstantVector::get(CV1);
20878 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20879
20880 // Load the 64-bit value into an XMM register.
20881 SDValue XR1 =
20882 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20883 SDValue CLod0 = DAG.getLoad(
20884 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20885 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20886 SDValue Unpck1 =
20887 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20888
20889 SDValue CLod1 = DAG.getLoad(
20890 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20891 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20892 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20893 // TODO: Are there any fast-math-flags to propagate here?
20894 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20895 SDValue Result;
20896
20897 if (Subtarget.hasSSE3() &&
20898 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20899 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20900 } else {
20901 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20902 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20903 }
20904 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20905 DAG.getIntPtrConstant(0, dl));
20906 return Result;
20907}
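
The constant-pool trick above can be restated in scalar C++ to see why it works. The sketch below is an illustration only (it assumes a little-endian host with IEEE-754 doubles and is not code used by this lowering): splicing 0x43300000 and 0x45300000 onto the two 32-bit halves builds the exact doubles 2^52 + lo and 2^84 + hi * 2^32; subtracting the biases (the subpd) and summing the halves (the haddpd/addpd) then yields the unsigned value with a single final rounding. The 32-bit expansion that follows uses just the low-half step of the same idea.

// Scalar sketch of the expansion above (illustration only; assumes a
// little-endian host with IEEE-754 doubles).
#include <cmath>
#include <cstdint>
#include <cstring>

static double uint64ToDouble(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL); // 2^52 + lo
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // 2^84 + hi * 2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  // Both bias subtractions are exact; only the final add rounds.
  return (Hi - std::ldexp(1.0, 84)) + (Lo - std::ldexp(1.0, 52));
}
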
20908
20909/// 32-bit unsigned integer to float expansion.
20910static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20911 const X86Subtarget &Subtarget) {
20912 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20913 SDLoc dl(Op);
20914 // FP constant to bias correct the final result.
20915 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20916 MVT::f64);
20917
20918 // Load the 32-bit value into an XMM register.
20919 SDValue Load =
20920 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20921
20922 // Zero out the upper parts of the register.
20923 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20924
20925 // Or the load with the bias.
20926 SDValue Or = DAG.getNode(
20927 ISD::OR, dl, MVT::v2i64,
20928 DAG.getBitcast(MVT::v2i64, Load),
20929 DAG.getBitcast(MVT::v2i64,
20930 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20931 Or =
20932 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20933 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20934
20935 if (Op.getNode()->isStrictFPOpcode()) {
20936 // Subtract the bias.
20937 // TODO: Are there any fast-math-flags to propagate here?
20938 SDValue Chain = Op.getOperand(0);
20939 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20940 {Chain, Or, Bias});
20941
20942 if (Op.getValueType() == Sub.getValueType())
20943 return Sub;
20944
20945 // Handle final rounding.
20946 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20947 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20948
20949 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20950 }
20951
20952 // Subtract the bias.
20953 // TODO: Are there any fast-math-flags to propagate here?
20954 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20955
20956 // Handle final rounding.
20957 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20958}
20959
20960static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20961 const X86Subtarget &Subtarget,
20962 const SDLoc &DL) {
20963 if (Op.getSimpleValueType() != MVT::v2f64)
20964 return SDValue();
20965
20966 bool IsStrict = Op->isStrictFPOpcode();
20967
20968 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20969 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20970
20971 if (Subtarget.hasAVX512()) {
20972 if (!Subtarget.hasVLX()) {
20973 // Let generic type legalization widen this.
20974 if (!IsStrict)
20975 return SDValue();
20976 // Otherwise pad the integer input with 0s and widen the operation.
20977 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20978 DAG.getConstant(0, DL, MVT::v2i32));
20979 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20980 {Op.getOperand(0), N0});
20981 SDValue Chain = Res.getValue(1);
20982 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20983 DAG.getIntPtrConstant(0, DL));
20984 return DAG.getMergeValues({Res, Chain}, DL);
20985 }
20986
20987 // Legalize to v4i32 type.
20988 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20989 DAG.getUNDEF(MVT::v2i32));
20990 if (IsStrict)
20991 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20992 {Op.getOperand(0), N0});
20993 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20994 }
20995
20996 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20997 // This gives us the floating point equivalent of 2^52 + the i32 integer
20998 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20999 // point leaving just our i32 integers in double format.
21000 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21001 SDValue VBias =
21002 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
21003 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21004 DAG.getBitcast(MVT::v2i64, VBias));
21005 Or = DAG.getBitcast(MVT::v2f64, Or);
21006
21007 if (IsStrict)
21008 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21009 {Op.getOperand(0), Or, VBias});
21010 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21011}
21012
21013static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21014 const X86Subtarget &Subtarget) {
21015 SDLoc DL(Op);
21016 bool IsStrict = Op->isStrictFPOpcode();
21017 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21018 MVT VecIntVT = V.getSimpleValueType();
21019 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21020 "Unsupported custom type");
21021
21022 if (Subtarget.hasAVX512()) {
21023 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21024 assert(!Subtarget.hasVLX() && "Unexpected features");
21025 MVT VT = Op->getSimpleValueType(0);
21026
21027 // v8i32->v8f64 is legal with AVX512 so just return it.
21028 if (VT == MVT::v8f64)
21029 return Op;
21030
21031 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21032 "Unexpected VT!");
21033 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21034 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21035 // Need to concat with zero vector for strict fp to avoid spurious
21036 // exceptions.
21037 SDValue Tmp =
21038 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21039 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21040 DAG.getIntPtrConstant(0, DL));
21041 SDValue Res, Chain;
21042 if (IsStrict) {
21043 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21044 {Op->getOperand(0), V});
21045 Chain = Res.getValue(1);
21046 } else {
21047 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21048 }
21049
21050 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21051 DAG.getIntPtrConstant(0, DL));
21052
21053 if (IsStrict)
21054 return DAG.getMergeValues({Res, Chain}, DL);
21055 return Res;
21056 }
21057
21058 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21059 Op->getSimpleValueType(0) == MVT::v4f64) {
21060 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21061 Constant *Bias = ConstantFP::get(
21062 *DAG.getContext(),
21063 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21064 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21065 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21066 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21067 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21068 SDValue VBias = DAG.getMemIntrinsicNode(
21069 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21070 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21071 MachineMemOperand::MOLoad);
21072
21073 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21074 DAG.getBitcast(MVT::v4i64, VBias));
21075 Or = DAG.getBitcast(MVT::v4f64, Or);
21076
21077 if (IsStrict)
21078 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21079 {Op.getOperand(0), Or, VBias});
21080 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21081 }
21082
21083 // The algorithm is the following:
21084 // #ifdef __SSE4_1__
21085 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21086 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21087 // (uint4) 0x53000000, 0xaa);
21088 // #else
21089 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21090 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21091 // #endif
21092 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21093 // return (float4) lo + fhi;
21094
21095 bool Is128 = VecIntVT == MVT::v4i32;
21096 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21097 // If we convert to something other than the supported type, e.g., to v4f64,
21098 // abort early.
21099 if (VecFloatVT != Op->getSimpleValueType(0))
21100 return SDValue();
21101
21102 // In the #ifdef/#else code, we have in common:
21103 // - The vector of constants:
21104 // -- 0x4b000000
21105 // -- 0x53000000
21106 // - A shift:
21107 // -- v >> 16
21108
21109 // Create the splat vector for 0x4b000000.
21110 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21111 // Create the splat vector for 0x53000000.
21112 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21113
21114 // Create the right shift.
21115 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21116 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21117
21118 SDValue Low, High;
21119 if (Subtarget.hasSSE41()) {
21120 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21121 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21122 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21123 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21124 // Low will be bitcasted right away, so do not bother bitcasting back to its
21125 // original type.
21126 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21127 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21128 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21129 // (uint4) 0x53000000, 0xaa);
21130 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21131 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21132 // High will be bitcasted right away, so do not bother bitcasting back to
21133 // its original type.
21134 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21135 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21136 } else {
21137 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21138 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21139 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21140 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21141
21142 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21143 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21144 }
21145
21146 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21147 SDValue VecCstFSub = DAG.getConstantFP(
21148 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21149
21150 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21151 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21152 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
21153 // enabled. See PR24512.
21154 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21155 // TODO: Are there any fast-math-flags to propagate here?
21156 // (float4) lo;
21157 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21158 // return (float4) lo + fhi;
21159 if (IsStrict) {
21160 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21161 {Op.getOperand(0), HighBitcast, VecCstFSub});
21162 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21163 {FHigh.getValue(1), LowBitcast, FHigh});
21164 }
21165
21166 SDValue FHigh =
21167 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21168 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21169}
21170
21171static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
21172 const X86Subtarget &Subtarget) {
21173 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21174 SDValue N0 = Op.getOperand(OpNo);
21175 MVT SrcVT = N0.getSimpleValueType();
21176 SDLoc dl(Op);
21177
21178 switch (SrcVT.SimpleTy) {
21179 default:
21180 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21181 case MVT::v2i32:
21182 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
21183 case MVT::v4i32:
21184 case MVT::v8i32:
21185 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
21186 case MVT::v2i64:
21187 case MVT::v4i64:
21188 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21189 }
21190}
21191
21192SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21193 SelectionDAG &DAG) const {
21194 bool IsStrict = Op->isStrictFPOpcode();
21195 unsigned OpNo = IsStrict ? 1 : 0;
21196 SDValue Src = Op.getOperand(OpNo);
21197 SDLoc dl(Op);
21198 auto PtrVT = getPointerTy(DAG.getDataLayout());
21199 MVT SrcVT = Src.getSimpleValueType();
21200 MVT DstVT = Op->getSimpleValueType(0);
21201 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21202
21203 if (DstVT == MVT::f128)
21204 return SDValue();
21205
21206 if (DstVT.isVector())
21207 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
21208
21209 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21210 return LowerWin64_INT128_TO_FP(Op, DAG);
21211
21212 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21213 return Extract;
21214
21215 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21216 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21217 // Conversions from unsigned i32 to f32/f64 are legal,
21218 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21219 return Op;
21220 }
21221
21222 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21223 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21224 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21225 if (IsStrict)
21226 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21227 {Chain, Src});
21228 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21229 }
21230
21231 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21232 return V;
21233 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21234 return V;
21235
21236 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21237 // infinity. It produces -0.0, so disable under strictfp.
21238 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21239 !IsStrict)
21240 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
21241 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21242 // negative infinity, so disable it under strictfp and use FILD instead.
21243 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21244 !IsStrict)
21245 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
21246 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21247 (DstVT == MVT::f32 || DstVT == MVT::f64))
21248 return SDValue();
21249
21250 // Make a 64-bit buffer, and use it to build an FILD.
21251 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21252 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21253 Align SlotAlign(8);
21254 MachinePointerInfo MPI =
21255 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21256 if (SrcVT == MVT::i32) {
21257 SDValue OffsetSlot =
21258 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
21259 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21260 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21261 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21262 std::pair<SDValue, SDValue> Tmp =
21263 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21264 if (IsStrict)
21265 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21266
21267 return Tmp.first;
21268 }
21269
21270 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21271 SDValue ValueToStore = Src;
21272 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21273 // Bitcasting to f64 here allows us to do a single 64-bit store from
21274 // an SSE register, avoiding the store forwarding penalty that would come
21275 // with two 32-bit stores.
21276 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21277 }
21278 SDValue Store =
21279 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21280 // For i64 source, we need to add the appropriate power of 2 if the input
21281 // was negative. We must be careful to do the computation in x87 extended
21282 // precision, not in SSE.
21283 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21284 SDValue Ops[] = { Store, StackSlot };
21285 SDValue Fild =
21286 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21287 SlotAlign, MachineMemOperand::MOLoad);
21288 Chain = Fild.getValue(1);
21289
21290
21291 // Check whether the sign bit is set.
21292 SDValue SignSet = DAG.getSetCC(
21293 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21294 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21295
21296 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21297 APInt FF(64, 0x5F80000000000000ULL);
21298 SDValue FudgePtr = DAG.getConstantPool(
21299 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21300 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21301
21302 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21303 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21304 SDValue Four = DAG.getIntPtrConstant(4, dl);
21305 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21306 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21307
21308 // Load the value out, extending it from f32 to f80.
21309 SDValue Fudge = DAG.getExtLoad(
21310 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21311 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
21312 CPAlignment);
21313 Chain = Fudge.getValue(1);
21314 // Extend everything to 80 bits to force it to be done on x87.
21315 // TODO: Are there any fast-math-flags to propagate here?
21316 if (IsStrict) {
21317 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
21318 {Chain, Fild, Fudge});
21319 // STRICT_FP_ROUND can't handle equal types.
21320 if (DstVT == MVT::f80)
21321 return Add;
21322 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21323 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
21324 }
21325 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
21326 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21327 DAG.getIntPtrConstant(0, dl));
21328}
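// [Editor's note, illustrative only, not part of the original source] A scalar
// model of the FILD-plus-fudge path above: FILD reads the u64 bit pattern as
// signed, so when the sign bit was set the loaded value is (u - 2^64) and the
// constant-pool "fudge" (0x5F800000 is the binary32 encoding of 2^64) restores
// it. Assumes long double has the x87 64-bit significand:
//
//   #include <cstdint>
//   static long double U64ToFPModel(uint64_t u) {
//     long double r = (long double)(int64_t)u;   // what FILD produces
//     if ((int64_t)u < 0)                        // sign bit was set
//       r += 18446744073709551616.0L;            // add 2^64 (the fudge)
//     return r;                                  // e.g. ~0ULL -> 18446744073709551615.0
//   }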
21329
21330// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21331// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21332// just return an SDValue().
21333// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21334// to i16, i32 or i64, and we lower it to a legal sequence and return the
21335// result.
21336SDValue
21337X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21338 bool IsSigned, SDValue &Chain) const {
21339 bool IsStrict = Op->isStrictFPOpcode();
21340 SDLoc DL(Op);
21341
21342 EVT DstTy = Op.getValueType();
21343 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21344 EVT TheVT = Value.getValueType();
21345 auto PtrVT = getPointerTy(DAG.getDataLayout());
21346
21347 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21348 // f16 must be promoted before using the lowering in this routine.
21349 // fp128 does not use this lowering.
21350 return SDValue();
21351 }
21352
21353 // If using FIST to compute an unsigned i64, we'll need some fixup
21354 // to handle values above the maximum signed i64. A FIST is always
21355 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21356 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21357
21358 // FIXME: This does not generate an invalid exception if the input does not
21359 // fit in i32. PR44019
21360 if (!IsSigned && DstTy != MVT::i64) {
21361 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21362 // The low 32 bits of the fist result will have the correct uint32 result.
21363 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21364 DstTy = MVT::i64;
21365 }
21366
21367 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21368 DstTy.getSimpleVT() >= MVT::i16 &&
21369 "Unknown FP_TO_INT to lower!");
21370
21371 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21372 // stack slot.
21373 MachineFunction &MF = DAG.getMachineFunction();
21374 unsigned MemSize = DstTy.getStoreSize();
21375 int SSFI =
21376 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21377 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21378
21379 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21380
21381 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21382
21383 if (UnsignedFixup) {
21384 //
21385 // Conversion to unsigned i64 is implemented with a select,
21386 // depending on whether the source value fits in the range
21387 // of a signed i64. Let Thresh be the FP equivalent of
21388 // 0x8000000000000000ULL.
21389 //
21390 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21391 // FltOfs = (Value >= Thresh) ? Thresh : 0;
21392 // FistSrc = (Value - FltOfs);
21393 // Fist-to-mem64 FistSrc
21394 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21395 // to XOR'ing the high 32 bits with Adjust.
21396 //
21397 // Being a power of 2, Thresh is exactly representable in all FP formats.
21398 // For X87 we'd like to use the smallest FP type for this constant, but
21399 // for DAG type consistency we have to match the FP operand type.
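// [Editor's worked example, not part of the original source] Converting
// Value = 2^63 + 2048.0 (f64) to unsigned i64: Value >= Thresh (= 2^63), so
// FltOfs = 2^63 and FistSrc = Value - FltOfs = 2048.0. The signed FIST stores
// 2048, Adjust = (1 << 63), and 2048 ^ 0x8000000000000000 = 0x8000000000000800,
// i.e. 2^63 + 2048, the expected unsigned result.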
21400
21401 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21402 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
21403 bool LosesInfo = false;
21404 if (TheVT == MVT::f64)
21405 // The rounding mode is irrelevant as the conversion should be exact.
21406 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21407 &LosesInfo);
21408 else if (TheVT == MVT::f80)
21409 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21410 APFloat::rmNearestTiesToEven, &LosesInfo);
21411
21412 assert(Status == APFloat::opOK && !LosesInfo &&
21413 "FP conversion should have been exact");
21414
21415 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21416
21417 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21418 *DAG.getContext(), TheVT);
21419 SDValue Cmp;
21420 if (IsStrict) {
21421 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21422 /*IsSignaling*/ true);
21423 Chain = Cmp.getValue(1);
21424 } else {
21425 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21426 }
21427
21428 // Our preferred lowering of
21429 //
21430 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21431 //
21432 // is
21433 //
21434 // (Value >= Thresh) << 63
21435 //
21436 // but since we can get here after LegalOperations, DAGCombine might do the
21437 // wrong thing if we create a select. So, directly create the preferred
21438 // version.
21439 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21440 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21441 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21442
21443 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21444 DAG.getConstantFP(0.0, DL, TheVT));
21445
21446 if (IsStrict) {
21447 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21448 { Chain, Value, FltOfs });
21449 Chain = Value.getValue(1);
21450 } else
21451 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21452 }
21453
21454 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21455
21456 // FIXME This causes a redundant load/store if the SSE-class value is already
21457 // in memory, such as if it is on the callstack.
21458 if (isScalarFPTypeInSSEReg(TheVT)) {
21459 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21460 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21461 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21462 SDValue Ops[] = { Chain, StackSlot };
21463
21464 unsigned FLDSize = TheVT.getStoreSize();
21465 assert(FLDSize <= MemSize && "Stack slot not big enough");
21466 MachineMemOperand *MMO = MF.getMachineMemOperand(
21467 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21468 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21469 Chain = Value.getValue(1);
21470 }
21471
21472 // Build the FP_TO_INT*_IN_MEM
21473 MachineMemOperand *MMO = MF.getMachineMemOperand(
21474 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21475 SDValue Ops[] = { Chain, Value, StackSlot };
21476 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21477 DAG.getVTList(MVT::Other),
21478 Ops, DstTy, MMO);
21479
21480 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
21481 Chain = Res.getValue(1);
21482
21483 // If we need an unsigned fixup, XOR the result with adjust.
21484 if (UnsignedFixup)
21485 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21486
21487 return Res;
21488}
21489
21490static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
21491 const X86Subtarget &Subtarget) {
21492 MVT VT = Op.getSimpleValueType();
21493 SDValue In = Op.getOperand(0);
21494 MVT InVT = In.getSimpleValueType();
21495 SDLoc dl(Op);
21496 unsigned Opc = Op.getOpcode();
21497
21498 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21499 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
21500 "Unexpected extension opcode");
21501 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21502 "Expected same number of elements");
21503 assert((VT.getVectorElementType() == MVT::i16 ||
21504 VT.getVectorElementType() == MVT::i32 ||
21505 VT.getVectorElementType() == MVT::i64) &&
21506 "Unexpected element type");
21507 assert((InVT.getVectorElementType() == MVT::i8 ||
21508 InVT.getVectorElementType() == MVT::i16 ||
21509 InVT.getVectorElementType() == MVT::i32) &&
21510 "Unexpected element type");
21511
21512 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
21513
21514 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21515 assert(InVT == MVT::v32i8 && "Unexpected VT!");
21516 return splitVectorIntUnary(Op, DAG);
21517 }
21518
21519 if (Subtarget.hasInt256())
21520 return Op;
21521
21522 // Optimize vectors in AVX mode:
21523 //
21524 // v8i16 -> v8i32
21525 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
21526 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
21527 // Concat upper and lower parts.
21528 //
21529 // v4i32 -> v4i64
21530 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
21531 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
21532 // Concat upper and lower parts.
21533 //
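// [Editor's worked example, not part of the original source] For the
// zero-extend case with In = v8i16 [a0..a7], vpunpckhwd(In, zero) produces
// [a4,0,a5,0,a6,0,a7,0]; bitcast to v4i32 that is exactly the zero extension
// of the upper four elements, which is then concatenated with OpLo below.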
21534 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21535 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21536
21537 // Short-circuit if we can determine that each 128-bit half is the same value.
21538 // Otherwise, this is difficult to match and optimize.
21539 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21540 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21541 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21542
21543 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21544 SDValue Undef = DAG.getUNDEF(InVT);
21545 bool NeedZero = Opc == ISD::ZERO_EXTEND;
21546 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21547 OpHi = DAG.getBitcast(HalfVT, OpHi);
21548
21549 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21550}
21551
21552// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21553static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21554 const SDLoc &dl, SelectionDAG &DAG) {
21555 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21556 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21557 DAG.getIntPtrConstant(0, dl));
21558 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21559 DAG.getIntPtrConstant(8, dl));
21560 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21561 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21562 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21563 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21564}
21565
21566static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
21567 const X86Subtarget &Subtarget,
21568 SelectionDAG &DAG) {
21569 MVT VT = Op->getSimpleValueType(0);
21570 SDValue In = Op->getOperand(0);
21571 MVT InVT = In.getSimpleValueType();
21572 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21573 SDLoc DL(Op);
21574 unsigned NumElts = VT.getVectorNumElements();
21575
21576 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
21577 // avoids a constant pool load.
21578 if (VT.getVectorElementType() != MVT::i8) {
21579 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21580 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21581 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21582 }
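// [Editor's worked example, not part of the original source] For a v4i1 ->
// v4i32 zero extend, a set lane sign-extends to 0xffffffff and the logical
// shift right by 31 leaves 1, while a clear lane stays 0; this matches
// selecting from {1, 0} without materializing a constant vector.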
21583
21584 // Extend VT if BWI is not supported.
21585 MVT ExtVT = VT;
21586 if (!Subtarget.hasBWI()) {
21587 // If v16i32 is to be avoided, we'll need to split and concatenate.
21588 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21589 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21590
21591 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21592 }
21593
21594 // Widen to 512-bits if VLX is not supported.
21595 MVT WideVT = ExtVT;
21596 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21597 NumElts *= 512 / ExtVT.getSizeInBits();
21598 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21599 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
21600 In, DAG.getIntPtrConstant(0, DL));
21601 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
21602 NumElts);
21603 }
21604
21605 SDValue One = DAG.getConstant(1, DL, WideVT);
21606 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21607
21608 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21609
21610 // Truncate if we had to extend above.
21611 if (VT != ExtVT) {
21612 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21613 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21614 }
21615
21616 // Extract back to 128/256-bit if we widened.
21617 if (WideVT != VT)
21618 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21619 DAG.getIntPtrConstant(0, DL));
21620
21621 return SelectedVal;
21622}
21623
21624static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21625 SelectionDAG &DAG) {
21626 SDValue In = Op.getOperand(0);
21627 MVT SVT = In.getSimpleValueType();
21628
21629 if (SVT.getVectorElementType() == MVT::i1)
21630 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
21631
21632 assert(Subtarget.hasAVX() && "Expected AVX support");
21633 return LowerAVXExtend(Op, DAG, Subtarget);
21634}
21635
21636/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21637/// It makes use of the fact that vectors with enough leading sign/zero bits
21638/// prevent the PACKSS/PACKUS from saturating the results.
21639/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21640/// within each 128-bit lane.
21641static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21642 const SDLoc &DL, SelectionDAG &DAG,
21643 const X86Subtarget &Subtarget) {
21644 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21645 "Unexpected PACK opcode");
21646 assert(DstVT.isVector() && "VT not a vector?");
21647
21648 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21649 if (!Subtarget.hasSSE2())
21650 return SDValue();
21651
21652 EVT SrcVT = In.getValueType();
21653
21654 // No truncation required, we might get here due to recursive calls.
21655 if (SrcVT == DstVT)
21656 return In;
21657
21658 // We only support vector truncation to 64bits or greater from a
21659 // 128bits or greater source.
21660 unsigned DstSizeInBits = DstVT.getSizeInBits();
21661 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21662 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
21663 return SDValue();
21664
21665 unsigned NumElems = SrcVT.getVectorNumElements();
21666 if (!isPowerOf2_32(NumElems))
21667 return SDValue();
21668
21669 LLVMContext &Ctx = *DAG.getContext();
21670 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21671 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21672
21673 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21674
21675 // Pack to the largest type possible:
21676 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21677 EVT InVT = MVT::i16, OutVT = MVT::i8;
21678 if (SrcVT.getScalarSizeInBits() > 16 &&
21679 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21680 InVT = MVT::i32;
21681 OutVT = MVT::i16;
21682 }
21683
21684 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
21685 if (SrcVT.is128BitVector()) {
21686 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21687 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21688 In = DAG.getBitcast(InVT, In);
21689 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21690 Res = extractSubVector(Res, 0, DAG, DL, 64);
21691 return DAG.getBitcast(DstVT, Res);
21692 }
21693
21694 // Split lower/upper subvectors.
21695 SDValue Lo, Hi;
21696 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21697
21698 unsigned SubSizeInBits = SrcSizeInBits / 2;
21699 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21700 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21701
21702 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21703 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21704 Lo = DAG.getBitcast(InVT, Lo);
21705 Hi = DAG.getBitcast(InVT, Hi);
21706 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21707 return DAG.getBitcast(DstVT, Res);
21708 }
21709
21710 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21711 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21712 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21713 Lo = DAG.getBitcast(InVT, Lo);
21714 Hi = DAG.getBitcast(InVT, Hi);
21715 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21716
21717 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21718 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21719 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21720 SmallVector<int, 64> Mask;
21721 int Scale = 64 / OutVT.getScalarSizeInBits();
21722 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21723 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21724
21725 if (DstVT.is256BitVector())
21726 return DAG.getBitcast(DstVT, Res);
21727
21728 // If 512bit -> 128bit truncate another stage.
21729 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21730 Res = DAG.getBitcast(PackedVT, Res);
21731 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21732 }
21733
21734 // Recursively pack lower/upper subvectors, concat result and pack again.
21735 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21736 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21737 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21738 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21739
21740 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21741 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21742 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21743}
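// [Editor's note, illustrative only, not part of the original source] The
// callers must guarantee enough leading sign/zero bits or the PACK node
// saturates instead of truncating: with PACKSSDW, a dword 0x00012345 would
// clamp to 0x7fff, while 0x00001234 (which sign-extends from 16 bits) packs
// to its exact low half 0x1234.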
21744
21745static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21746 const X86Subtarget &Subtarget) {
21747
21748 SDLoc DL(Op);
21749 MVT VT = Op.getSimpleValueType();
21750 SDValue In = Op.getOperand(0);
21751 MVT InVT = In.getSimpleValueType();
21752
21753 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21754
21755 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21756 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21757 if (InVT.getScalarSizeInBits() <= 16) {
21758 if (Subtarget.hasBWI()) {
21759 // legal, will go to VPMOVB2M, VPMOVW2M
21760 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21761 // We need to shift to get the lsb into sign position.
21762 // Shift packed bytes not supported natively, bitcast to word
21763 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21764 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21765 DAG.getBitcast(ExtVT, In),
21766 DAG.getConstant(ShiftInx, DL, ExtVT));
21767 In = DAG.getBitcast(InVT, In);
21768 }
21769 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21770 In, ISD::SETGT);
21771 }
21772 // Use TESTD/Q, extended vector to packed dword/qword.
21773 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21774 "Unexpected vector type.");
21775 unsigned NumElts = InVT.getVectorNumElements();
21776 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21777 // We need to change to a wider element type that we have support for.
21778 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21779 // For 16 element vectors we extend to v16i32 unless we are explicitly
21780 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21781 // we need to split into two 8 element vectors which we can extend to v8i32,
21782 // truncate and concat the results. There's an additional complication if
21783 // the original type is v16i8. In that case we can't split the v16i8
21784 // directly, so we need to shuffle high elements to low and use
21785 // sign_extend_vector_inreg.
21786 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21787 SDValue Lo, Hi;
21788 if (InVT == MVT::v16i8) {
21789 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21790 Hi = DAG.getVectorShuffle(
21791 InVT, DL, In, In,
21792 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21793 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21794 } else {
21795 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21796 Lo = extract128BitVector(In, 0, DAG, DL);
21797 Hi = extract128BitVector(In, 8, DAG, DL);
21798 }
21799 // We're split now, just emit two truncates and a concat. The two
21800 // truncates will trigger legalization to come back to this function.
21801 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21802 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21803 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21804 }
21805 // We either have 8 elements or we're allowed to use 512-bit vectors.
21806 // If we have VLX, we want to use the narrowest vector that can get the
21807 // job done so we use vXi32.
21808 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21809 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21810 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21811 InVT = ExtVT;
21812 ShiftInx = InVT.getScalarSizeInBits() - 1;
21813 }
21814
21815 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21816 // We need to shift to get the lsb into sign position.
21817 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21818 DAG.getConstant(ShiftInx, DL, InVT));
21819 }
21820 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21821 if (Subtarget.hasDQI())
21822 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21823 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21824}
21825
21826SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21827 SDLoc DL(Op);
21828 MVT VT = Op.getSimpleValueType();
21829 SDValue In = Op.getOperand(0);
21830 MVT InVT = In.getSimpleValueType();
21831 unsigned InNumEltBits = InVT.getScalarSizeInBits();
21832
21833 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21834 "Invalid TRUNCATE operation");
21835
21836 // If we're called by the type legalizer, handle a few cases.
21837 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21838 if (!TLI.isTypeLegal(InVT)) {
21839 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21840 VT.is128BitVector()) {
21841 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21842 "Unexpected subtarget!");
21843 // The default behavior is to truncate one step, concatenate, and then
21844 // truncate the remainder. We'd rather produce two 64-bit results and
21845 // concatenate those.
21846 SDValue Lo, Hi;
21847 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21848
21849 EVT LoVT, HiVT;
21850 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21851
21852 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21853 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21854 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21855 }
21856
21857 // Otherwise let default legalization handle it.
21858 return SDValue();
21859 }
21860
21861 if (VT.getVectorElementType() == MVT::i1)
21862 return LowerTruncateVecI1(Op, DAG, Subtarget);
21863
21864 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21865 if (Subtarget.hasAVX512()) {
21866 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21867 assert(VT == MVT::v32i8 && "Unexpected VT!");
21868 return splitVectorIntUnary(Op, DAG);
21869 }
21870
21871 // word to byte only under BWI. Otherwise we have to promote to v16i32
21872 // and then truncate that. But we should only do that if we haven't been
21873 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21874 // handled by isel patterns.
21875 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21876 Subtarget.canExtendTo512DQ())
21877 return Op;
21878 }
21879
21880 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21881 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21882
21883 // Truncate with PACKUS if we are truncating a vector with leading zero bits
21884 // that extend all the way to the packed/truncated value.
21885 // Pre-SSE41 we can only use PACKUSWB.
21886 KnownBits Known = DAG.computeKnownBits(In);
21887 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21888 if (SDValue V =
21889 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21890 return V;
21891
21892 // Truncate with PACKSS if we are truncating a vector with sign-bits that
21893 // extend all the way to the packed/truncated value.
21894 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21895 if (SDValue V =
21896 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21897 return V;
21898
21899 // Handle truncation of V256 to V128 using shuffles.
21900 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21901
21902 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21903 In = DAG.getBitcast(MVT::v8i32, In);
21904
21905 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21906 if (Subtarget.hasInt256()) {
21907 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21908 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21909 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21910 DAG.getIntPtrConstant(0, DL));
21911 }
21912
21913 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21914 DAG.getIntPtrConstant(0, DL));
21915 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21916 DAG.getIntPtrConstant(4, DL));
21917 static const int ShufMask[] = {0, 2, 4, 6};
21918 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21919 }
21920
21921 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21922 In = DAG.getBitcast(MVT::v32i8, In);
21923
21924 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21925 if (Subtarget.hasInt256()) {
21926 // The PSHUFB mask:
21927 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21928 -1, -1, -1, -1, -1, -1, -1, -1,
21929 16, 17, 20, 21, 24, 25, 28, 29,
21930 -1, -1, -1, -1, -1, -1, -1, -1 };
21931 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21932 In = DAG.getBitcast(MVT::v4i64, In);
21933
21934 static const int ShufMask2[] = {0, 2, -1, -1};
21935 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21936 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21937 DAG.getBitcast(MVT::v16i16, In),
21938 DAG.getIntPtrConstant(0, DL));
21939 }
21940
21941 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21942 DAG.getIntPtrConstant(0, DL));
21943 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21944 DAG.getIntPtrConstant(16, DL));
21945
21946 // The PSHUFB mask:
21947 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21948 -1, -1, -1, -1, -1, -1, -1, -1};
21949
21950 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21951 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21952
21953 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21954 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21955
21956 // The MOVLHPS Mask:
21957 static const int ShufMask2[] = {0, 1, 4, 5};
21958 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21959 return DAG.getBitcast(MVT::v8i16, res);
21960 }
21961
21962 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21963 // Use an AND to zero the upper bits for PACKUS.
21964 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21965
21966 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21967 DAG.getIntPtrConstant(0, DL));
21968 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21969 DAG.getIntPtrConstant(8, DL));
21970 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21971 }
21972
21973 llvm_unreachable("All 256->128 cases should have been handled above!");
21974}
21975
21976// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21977// behaves on out of range inputs to generate optimized conversions.
21978static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21979 SelectionDAG &DAG,
21980 const X86Subtarget &Subtarget) {
21981 MVT SrcVT = Src.getSimpleValueType();
21982 unsigned DstBits = VT.getScalarSizeInBits();
21983 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21984
21985 // Calculate the converted result for values in the range 0 to
21986 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21987 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21988 SDValue Big =
21989 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21990 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21991 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21992
21993 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21994 // and only if the value was out of range. So we can use that
21995 // as our indicator that we should use "Big" instead of "Small".
21996 //
21997 // Use "Small" if "IsOverflown" has all bits cleared
21998 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
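// [Editor's worked example, not part of the original source] For the input
// 3000000000.0f: Small = cvttps2dq(3.0e9f) = 0x80000000 (out of range), and
// Big = cvttps2dq(3.0e9f - 2^31) = 852516352 (0x32d05e00). IsOverflown is
// all ones, so Small | (Big & IsOverflown) = 0xb2d05e00 = 3000000000.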
21999
22000 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22001 // use the slightly slower blendv select instead.
22002 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22003 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22004 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22005 }
22006
22007 SDValue IsOverflown =
22008 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22009 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22010 return DAG.getNode(ISD::OR, dl, VT, Small,
22011 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22012}
22013
22014SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22015 bool IsStrict = Op->isStrictFPOpcode();
22016 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22017 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22018 MVT VT = Op->getSimpleValueType(0);
22019 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22020 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22021 MVT SrcVT = Src.getSimpleValueType();
22022 SDLoc dl(Op);
22023
22024 SDValue Res;
22025 if (VT.isVector()) {
22026 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22027 MVT ResVT = MVT::v4i32;
22028 MVT TruncVT = MVT::v4i1;
22029 unsigned Opc;
22030 if (IsStrict)
22031 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22032 else
22033 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22034
22035 if (!IsSigned && !Subtarget.hasVLX()) {
22036 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22037 // Widen to 512-bits.
22038 ResVT = MVT::v8i32;
22039 TruncVT = MVT::v8i1;
22040 Opc = Op.getOpcode();
22041 // Need to concat with zero vector for strict fp to avoid spurious
22042 // exceptions.
22043 // TODO: Should we just do this for non-strict as well?
22044 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22045 : DAG.getUNDEF(MVT::v8f64);
22046 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22047 DAG.getIntPtrConstant(0, dl));
22048 }
22049 if (IsStrict) {
22050 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22051 Chain = Res.getValue(1);
22052 } else {
22053 Res = DAG.getNode(Opc, dl, ResVT, Src);
22054 }
22055
22056 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22057 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22058 DAG.getIntPtrConstant(0, dl));
22059 if (IsStrict)
22060 return DAG.getMergeValues({Res, Chain}, dl);
22061 return Res;
22062 }
22063
22064 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
22065 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
22066 return Op;
22067
22068 MVT ResVT = VT;
22069 MVT EleVT = VT.getVectorElementType();
22070 if (EleVT != MVT::i64)
22071 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
22072
22073 if (SrcVT != MVT::v8f16) {
22074 SDValue Tmp =
22075 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
22076 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
22077 Ops[0] = Src;
22078 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
22079 }
22080
22081 if (IsStrict) {
22082 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
22083 : X86ISD::STRICT_CVTTP2UI,
22084 dl, {ResVT, MVT::Other}, {Chain, Src});
22085 Chain = Res.getValue(1);
22086 } else {
22087 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
22088 ResVT, Src);
22089 }
22090
22091 // TODO: Need to add exception check code for strict FP.
22092 if (EleVT.getSizeInBits() < 16) {
22093 ResVT = MVT::getVectorVT(EleVT, 8);
22094 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
22095 }
22096
22097 if (ResVT != VT)
22098 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22099 DAG.getIntPtrConstant(0, dl));
22100
22101 if (IsStrict)
22102 return DAG.getMergeValues({Res, Chain}, dl);
22103 return Res;
22104 }
22105
22106 if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
22107 if (IsStrict) {
22108 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
22109 : ISD::STRICT_FP_TO_UINT,
22110 dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
22111 Chain = Res.getValue(1);
22112 } else {
22113 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
22114 MVT::v8i32, Src);
22115 }
22116
22117 // TODO: Need to add exception check code for strict FP.
22118 Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
22119
22120 if (IsStrict)
22121 return DAG.getMergeValues({Res, Chain}, dl);
22122 return Res;
22123 }
22124
22125 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
22126 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
22127 assert(!IsSigned && "Expected unsigned conversion!");
22128 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
22129 return Op;
22130 }
22131
22132 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
22133 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
22134 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
22135 Subtarget.useAVX512Regs()) {
22136 assert(!IsSigned && "Expected unsigned conversion!");
22137 assert(!Subtarget.hasVLX() && "Unexpected features!");
22138 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22139 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22140 // Need to concat with zero vector for strict fp to avoid spurious
22141 // exceptions.
22142 // TODO: Should we just do this for non-strict as well?
22143 SDValue Tmp =
22144 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22145 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22146 DAG.getIntPtrConstant(0, dl));
22147
22148 if (IsStrict) {
22149 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
22150 {Chain, Src});
22151 Chain = Res.getValue(1);
22152 } else {
22153 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
22154 }
22155
22156 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22157 DAG.getIntPtrConstant(0, dl));
22158
22159 if (IsStrict)
22160 return DAG.getMergeValues({Res, Chain}, dl);
22161 return Res;
22162 }
22163
22164 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
22165 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
22166 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
22167 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
22168 assert(!Subtarget.hasVLX() && "Unexpected features!");
22169 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
22170 // Need to concat with zero vector for strict fp to avoid spurious
22171 // exceptions.
22172 // TODO: Should we just do this for non-strict as well?
22173 SDValue Tmp =
22174 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22175 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22176 DAG.getIntPtrConstant(0, dl));
22177
22178 if (IsStrict) {
22179 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22180 {Chain, Src});
22181 Chain = Res.getValue(1);
22182 } else {
22183 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
22184 }
22185
22186 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22187 DAG.getIntPtrConstant(0, dl));
22188
22189 if (IsStrict)
22190 return DAG.getMergeValues({Res, Chain}, dl);
22191 return Res;
22192 }
22193
22194 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
22195 if (!Subtarget.hasVLX()) {
22196 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
22197 // legalizer and then widened again by vector op legalization.
22198 if (!IsStrict)
22199 return SDValue();
22200
22201 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
22202 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
22203 {Src, Zero, Zero, Zero});
22204 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22205 {Chain, Tmp});
22206 SDValue Chain = Tmp.getValue(1);
22207 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
22208 DAG.getIntPtrConstant(0, dl));
22209 return DAG.getMergeValues({Tmp, Chain}, dl);
22210 }
22211
22212 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
22213 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
22214 DAG.getUNDEF(MVT::v2f32));
22215 if (IsStrict) {
22216 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
22217 : X86ISD::STRICT_CVTTP2UI;
22218 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
22219 }
22220 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22221 return DAG.getNode(Opc, dl, VT, Tmp);
22222 }
22223
22224 // Generate optimized instructions for pre-AVX512 unsigned conversions from
22225 // vXf32/vXf64 to vXi32.
22226 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
22227 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
22228 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
22229 assert(!IsSigned && "Expected unsigned conversion!");
22230 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
22231 }
22232
22233 return SDValue();
22234 }
22235
22236 assert(!VT.isVector());
22237
22238 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
22239
22240 if (!IsSigned && UseSSEReg) {
22241 // Conversions from f32/f64 with AVX512 should be legal.
22242 if (Subtarget.hasAVX512())
22243 return Op;
22244
22245 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
22246 // behaves on out of range inputs to generate optimized conversions.
22247 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
22248 (VT == MVT::i64 && Subtarget.is64Bit()))) {
22249 unsigned DstBits = VT.getScalarSizeInBits();
22250 APInt UIntLimit = APInt::getSignMask(DstBits);
22251 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
22252 DAG.getConstant(UIntLimit, dl, VT));
22253 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
22254
22255 // Calculate the converted result for values in the range:
22256 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22257 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
22258 SDValue Small =
22259 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
22260 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
22261 SDValue Big = DAG.getNode(
22262 X86ISD::CVTTS2SI, dl, VT,
22263 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
22264 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
22265
22266 // The "CVTTS2SI" instruction conveniently sets the sign bit if
22267 // and only if the value was out of range. So we can use that
22268 // as our indicator that we should use "Big" instead of "Small".
22269 //
22270 // Use "Small" if "IsOverflown" has all bits cleared
22271 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
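// Worked example (i32 result): for Src = 3e9f, "Small" = cvttss2si(3e9f) =
// 0x80000000 (out of range), so "IsOverflown" is all ones; "Big" =
// cvttss2si(3e9f - 2^31) = 852516352, and 0x80000000 | 852516352 =
// 3000000000, the expected unsigned result.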
22272 SDValue IsOverflown = DAG.getNode(
22273 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
22274 return DAG.getNode(ISD::OR, dl, VT, Small,
22275 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22276 }
22277
22278 // Use default expansion for i64.
22279 if (VT == MVT::i64)
22280 return SDValue();
22281
22282 assert(VT == MVT::i32 && "Unexpected VT!");
22283
22284 // Promote i32 to i64 and use a signed operation on 64-bit targets.
22285 // FIXME: This does not generate an invalid exception if the input does not
22286 // fit in i32. PR44019
22287 if (Subtarget.is64Bit()) {
22288 if (IsStrict) {
22289 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
22290 {Chain, Src});
22291 Chain = Res.getValue(1);
22292 } else
22293 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
22294
22295 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22296 if (IsStrict)
22297 return DAG.getMergeValues({Res, Chain}, dl);
22298 return Res;
22299 }
22300
22301 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
22302 // use fisttp which will be handled later.
22303 if (!Subtarget.hasSSE3())
22304 return SDValue();
22305 }
22306
22307 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
22308 // FIXME: This does not generate an invalid exception if the input does not
22309 // fit in i16. PR44019
22310 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
22311 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
22312 if (IsStrict) {
22313 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
22314 {Chain, Src});
22315 Chain = Res.getValue(1);
22316 } else
22317 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
22318
22319 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22320 if (IsStrict)
22321 return DAG.getMergeValues({Res, Chain}, dl);
22322 return Res;
22323 }
22324
22325 // If this is a FP_TO_SINT using SSEReg we're done.
22326 if (UseSSEReg && IsSigned)
22327 return Op;
22328
22329 // fp128 needs to use a libcall.
22330 if (SrcVT == MVT::f128) {
22331 RTLIB::Libcall LC;
22332 if (IsSigned)
22333 LC = RTLIB::getFPTOSINT(SrcVT, VT);
22334 else
22335 LC = RTLIB::getFPTOUINT(SrcVT, VT);
22336
22337 MakeLibCallOptions CallOptions;
22338 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
22339 SDLoc(Op), Chain);
22340
22341 if (IsStrict)
22342 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
22343
22344 return Tmp.first;
22345 }
22346
22347 // Fall back to X87.
22348 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
22349 if (IsStrict)
22350 return DAG.getMergeValues({V, Chain}, dl);
22351 return V;
22352 }
22353
22354 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
22355}
22356
22357SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
22358 SelectionDAG &DAG) const {
22359 SDValue Src = Op.getOperand(0);
22360 MVT SrcVT = Src.getSimpleValueType();
22361
22362 // If the source is in an SSE register, the node is Legal.
22363 if (isScalarFPTypeInSSEReg(SrcVT))
22364 return Op;
22365
22366 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22367}
22368
22369SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22370 SelectionDAG &DAG) const {
22371 EVT DstVT = N->getValueType(0);
22372 SDValue Src = N->getOperand(0);
22373 EVT SrcVT = Src.getValueType();
22374
22375 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22376 // f16 must be promoted before using the lowering in this routine.
22377 // fp128 does not use this lowering.
22378 return SDValue();
22379 }
22380
22381 SDLoc DL(N);
22382 SDValue Chain = DAG.getEntryNode();
22383
22384 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22385
22386 // If we're converting from SSE, the stack slot needs to hold both types.
22387 // Otherwise it only needs to hold the DstVT.
22388 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22389 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22390 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22391 MachinePointerInfo MPI =
22392 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22393
22394 if (UseSSE) {
22395 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22396 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22397 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22398 SDValue Ops[] = { Chain, StackPtr };
22399
22400 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22401 /*Align*/ None, MachineMemOperand::MOLoad);
22402 Chain = Src.getValue(1);
22403 }
22404
22405 SDValue StoreOps[] = { Chain, Src, StackPtr };
22406 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22407 StoreOps, DstVT, MPI, /*Align*/ None,
22408 MachineMemOperand::MOStore);
22409
22410 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22411}
22412
22413SDValue
22414X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22415 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22416 // but making use of X86 specifics to produce better instruction sequences.
22417 SDNode *Node = Op.getNode();
22418 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22419 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22420 SDLoc dl(SDValue(Node, 0));
22421 SDValue Src = Node->getOperand(0);
22422
22423 // There are three types involved here: SrcVT is the source floating point
22424 // type, DstVT is the type of the result, and TmpVT is the result of the
22425 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22426 // DstVT).
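// For example, a saturating f32->i8 conversion reaches here with
// SrcVT = f32 and DstVT = i8; TmpVT starts equal to DstVT and is promoted
// to i32 below.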
22427 EVT SrcVT = Src.getValueType();
22428 EVT DstVT = Node->getValueType(0);
22429 EVT TmpVT = DstVT;
22430
22431 // This code is only for floats and doubles. Fall back to generic code for
22432 // anything else.
22433 if (!isScalarFPTypeInSSEReg(SrcVT))
22434 return SDValue();
22435
22436 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22437 unsigned SatWidth = SatVT.getScalarSizeInBits();
22438 unsigned DstWidth = DstVT.getScalarSizeInBits();
22439 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22440 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22441 "Expected saturation width smaller than result width");
22442
22443 // Promote result of FP_TO_*INT to at least 32 bits.
22444 if (TmpWidth < 32) {
22445 TmpVT = MVT::i32;
22446 TmpWidth = 32;
22447 }
22448
22449 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22450 // us to use a native signed conversion instead.
22451 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22452 TmpVT = MVT::i64;
22453 TmpWidth = 64;
22454 }
22455
22456 // If the saturation width is smaller than the size of the temporary result,
22457 // we can always use signed conversion, which is native.
22458 if (SatWidth < TmpWidth)
22459 FpToIntOpcode = ISD::FP_TO_SINT;
22460
22461 // Determine minimum and maximum integer values and their corresponding
22462 // floating-point values.
22463 APInt MinInt, MaxInt;
22464 if (IsSigned) {
22465 MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
22466 MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
22467 } else {
22468 MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
22469 MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
22470 }
22471
22472 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22473 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22474
22475 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22476 MinInt, IsSigned, APFloat::rmTowardZero);
22477 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22478 MaxInt, IsSigned, APFloat::rmTowardZero);
22479 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22480 && !(MaxStatus & APFloat::opStatus::opInexact);
22481
22482 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22483 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22484
22485 // If the integer bounds are exactly representable as floats, emit a
22486 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22487 if (AreExactFloatBounds) {
22488 if (DstVT != TmpVT) {
22489 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22490 SDValue MinClamped = DAG.getNode(
22491 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22492 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22493 SDValue BothClamped = DAG.getNode(
22494 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22495 // Convert clamped value to integer.
22496 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22497
22498 // NaN will become INDVAL, with the top bit set and the rest zero.
22499 // Truncation will discard the top bit, resulting in zero.
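// ("INDVAL" is the x86 "integer indefinite" value the cvtt* instructions
// return for NaN or out-of-range inputs: only the most significant bit of
// the destination integer is set.)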
22500 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22501 }
22502
22503 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22504 SDValue MinClamped = DAG.getNode(
22505 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22506 // Clamp by MaxFloat from above. NaN cannot occur.
22507 SDValue BothClamped = DAG.getNode(
22508 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22509 // Convert clamped value to integer.
22510 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22511
22512 if (!IsSigned) {
22513 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22514 // which is zero.
22515 return FpToInt;
22516 }
22517
22518 // Otherwise, select zero if Src is NaN.
22519 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22520 return DAG.getSelectCC(
22521 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22522 }
22523
22524 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22525 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22526
22527 // Result of direct conversion, which may be selected away.
22528 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22529
22530 if (DstVT != TmpVT) {
22531 // NaN will become INDVAL, with the top bit set and the rest zero.
22532 // Truncation will discard the top bit, resulting in zero.
22533 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22534 }
22535
22536 SDValue Select = FpToInt;
22537 // For signed conversions where we saturate to the same size as the
22538 // result type of the fptoi instructions, INDVAL coincides with integer
22539 // minimum, so we don't need to explicitly check it.
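// E.g. for a signed f32->i32 saturation with TmpVT == i32, a too-negative
// input already produced 0x80000000 == INT32_MIN above, which is the
// correct saturated value, so the SETULT clamp can be skipped; NaN is
// still mapped to zero by the final SETUO select below.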
22540 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22541 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22542 // MinInt if Src is NaN.
22543 Select = DAG.getSelectCC(
22544 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22545 }
22546
22547 // If Src OGT MaxFloat, select MaxInt.
22548 Select = DAG.getSelectCC(
22549 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22550
22551 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22552 // is already zero. The promoted case was already handled above.
22553 if (!IsSigned || DstVT != TmpVT) {
22554 return Select;
22555 }
22556
22557 // Otherwise, select 0 if Src is NaN.
22558 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22559 return DAG.getSelectCC(
22560 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22561}
22562
22563SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22564 bool IsStrict = Op->isStrictFPOpcode();
22565
22566 SDLoc DL(Op);
22567 MVT VT = Op.getSimpleValueType();
22568 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22569 MVT SVT = In.getSimpleValueType();
22570
22571 if (VT == MVT::f128)
22572 return SDValue();
22573
22574 if (VT == MVT::f80) {
22575 if (SVT == MVT::f16) {
22576 assert(Subtarget.hasFP16() && "Unexpected features!");
22577 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
22578 MakeLibCallOptions CallOptions;
22579 std::pair<SDValue, SDValue> Tmp =
22580 makeLibCall(DAG, LC, VT, In, CallOptions, DL,
22581 IsStrict ? Op.getOperand(0) : SDValue());
22582 if (IsStrict)
22583 return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
22584 else
22585 return Tmp.first;
22586 }
22587 return Op;
22588 }
22589
22590 if (SVT.getVectorElementType() == MVT::f16) {
22591 assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
22592 if (SVT == MVT::v2f16)
22593 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22594 DAG.getUNDEF(MVT::v2f16));
22595 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22596 DAG.getUNDEF(MVT::v4f16));
22597 if (IsStrict)
22598 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22599 {Op->getOperand(0), Res});
22600 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22601 }
22602
22603 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22604
22605 SDValue Res =
22606 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22607 if (IsStrict)
22608 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22609 {Op->getOperand(0), Res});
22610 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22611}
22612
22613SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22614 bool IsStrict = Op->isStrictFPOpcode();
22615 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22616 MVT VT = Op.getSimpleValueType();
22617 MVT SVT = In.getSimpleValueType();
22618
22619 // It's legal except when f128 is involved or we're converting f80->f16.
22620 if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
22621 return Op;
22622
22623 return SDValue();
22624}
22625
22626static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22627 bool IsStrict = Op->isStrictFPOpcode();
22628 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22629 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22630 "Unexpected VT!");
22631
22632 SDLoc dl(Op);
22633 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22634 DAG.getConstant(0, dl, MVT::v8i16), Src,
22635 DAG.getIntPtrConstant(0, dl));
22636
22637 SDValue Chain;
22638 if (IsStrict) {
22639 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22640 {Op.getOperand(0), Res});
22641 Chain = Res.getValue(1);
22642 } else {
22643 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22644 }
22645
22646 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22647 DAG.getIntPtrConstant(0, dl));
22648
22649 if (IsStrict)
22650 return DAG.getMergeValues({Res, Chain}, dl);
22651
22652 return Res;
22653}
22654
22655static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22656 bool IsStrict = Op->isStrictFPOpcode();
22657 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22658 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22659 "Unexpected VT!");
22660
22661 SDLoc dl(Op);
22662 SDValue Res, Chain;
22663 if (IsStrict) {
22664 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22665 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22666 DAG.getIntPtrConstant(0, dl));
22667 Res = DAG.getNode(
22668 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22669 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22670 Chain = Res.getValue(1);
22671 } else {
22672 // FIXME: Should we use zeros for upper elements for non-strict?
22673 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22674 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22675 DAG.getTargetConstant(4, dl, MVT::i32));
22676 }
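// Note: the immediate 4 passed to (STRICT_)CVTPS2PH above sets imm8 bit 2,
// which selects the current MXCSR rounding mode for the conversion rather
// than an explicit rounding control in the immediate.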
22677
22678 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22679 DAG.getIntPtrConstant(0, dl));
22680
22681 if (IsStrict)
22682 return DAG.getMergeValues({Res, Chain}, dl);
22683
22684 return Res;
22685}
22686
22687/// Depending on uarch and/or optimizing for size, we might prefer to use a
22688/// vector operation in place of the typical scalar operation.
22689static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
22690 const X86Subtarget &Subtarget) {
22691 // If both operands have other uses, this is probably not profitable.
22692 SDValue LHS = Op.getOperand(0);
22693 SDValue RHS = Op.getOperand(1);
22694 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22695 return Op;
22696
22697 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22698 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22699 if (IsFP && !Subtarget.hasSSE3())
22700 return Op;
22701 if (!IsFP && !Subtarget.hasSSSE3())
22702 return Op;
22703
22704 // Extract from a common vector.
22705 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22706 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22707 LHS.getOperand(0) != RHS.getOperand(0) ||
22708 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22709 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22710 !shouldUseHorizontalOp(true, DAG, Subtarget))
22711 return Op;
22712
22713 // Allow commuted 'hadd' ops.
22714 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22715 unsigned HOpcode;
22716 switch (Op.getOpcode()) {
22717 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22718 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22719 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22720 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22721 default:
22722 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22723 }
22724 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22725 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22726 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22727 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22728 std::swap(LExtIndex, RExtIndex);
22729
22730 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22731 return Op;
22732
22733 SDValue X = LHS.getOperand(0);
22734 EVT VecVT = X.getValueType();
22735 unsigned BitWidth = VecVT.getSizeInBits();
22736 unsigned NumLanes = BitWidth / 128;
22737 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22738 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22739 "Not expecting illegal vector widths here");
22740
22741 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22742 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22743 SDLoc DL(Op);
22744 if (BitWidth == 256 || BitWidth == 512) {
22745 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22746 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22747 LExtIndex %= NumEltsPerLane;
22748 }
22749
22750 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22751 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22752 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22753 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22754 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22755 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22756 DAG.getIntPtrConstant(LExtIndex / 2, DL));
22757}
22758
22759/// Depending on uarch and/or optimizing for size, we might prefer to use a
22760/// vector operation in place of the typical scalar operation.
22761SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22762 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22763 "Only expecting float/double");
22764 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
22765}
22766
22767/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22768/// This mode isn't supported in hardware on X86. But as long as we aren't
22769/// compiling with trapping math, we can emulate this with
22770/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22771static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22772 SDValue N0 = Op.getOperand(0);
22773 SDLoc dl(Op);
22774 MVT VT = Op.getSimpleValueType();
22775
22776 // N0 += copysign(nextafter(0.5, 0.0), N0)
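// Using nextafter(0.5, 0.0) instead of 0.5 avoids rounding inputs just
// below 0.5 up to 1: for the largest float below 0.5, adding exactly 0.5
// would round the FP sum up to 1.0 (ties to even), while adding the
// slightly smaller constant keeps the sum below 1.0, so FTRUNC yields 0.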
22777 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22778 bool Ignored;
22779 APFloat Point5Pred = APFloat(0.5f);
22780 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22781 Point5Pred.next(/*nextDown*/true);
22782
22783 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22784 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22785 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22786
22787 // Truncate the result to remove fraction.
22788 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22789}
22790
22791/// The only differences between FABS and FNEG are the mask and the logic op.
22792/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22793static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22794 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22795 "Wrong opcode for lowering FABS or FNEG.");
22796
22797 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22798
22799 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22800 // into an FNABS. We'll lower the FABS after that if it is still in use.
22801 if (IsFABS)
22802 for (SDNode *User : Op->uses())
22803 if (User->getOpcode() == ISD::FNEG)
22804 return Op;
22805
22806 SDLoc dl(Op);
22807 MVT VT = Op.getSimpleValueType();
22808
22809 bool IsF128 = (VT == MVT::f128);
22810 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22811 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22812 "Unexpected type in LowerFABSorFNEG");
22813
22814 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22815 // decide if we should generate a 16-byte constant mask when we only need 4 or
22816 // 8 bytes for the scalar case.
22817
22818 // There are no scalar bitwise logical SSE/AVX instructions, so we
22819 // generate a 16-byte vector constant and logic op even for the scalar case.
22820 // Using a 16-byte mask allows folding the load of the mask with
22821 // the logic op, so it can save (~4 bytes) on code size.
22822 bool IsFakeVector = !VT.isVector() && !IsF128;
22823 MVT LogicVT = VT;
22824 if (IsFakeVector)
22825 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22826 : (VT == MVT::f32) ? MVT::v4f32
22827 : MVT::v8f16;
22828
22829 unsigned EltBits = VT.getScalarSizeInBits();
22830 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22831 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22832 APInt::getSignMask(EltBits);
22833 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22834 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22835
22836 SDValue Op0 = Op.getOperand(0);
22837 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22838 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22839 IsFNABS ? X86ISD::FOR :
22840 X86ISD::FXOR;
22841 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22842
22843 if (VT.isVector() || IsF128)
22844 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22845
22846 // For the scalar case extend to a 128-bit vector, perform the logic op,
22847 // and extract the scalar result back out.
22848 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22849 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22850 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22851 DAG.getIntPtrConstant(0, dl));
22852}
22853
22854static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22855 SDValue Mag = Op.getOperand(0);
22856 SDValue Sign = Op.getOperand(1);
22857 SDLoc dl(Op);
22858
22859 // If the sign operand is smaller, extend it first.
22860 MVT VT = Op.getSimpleValueType();
22861 if (Sign.getSimpleValueType().bitsLT(VT))
22862 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22863
22864 // And if it is bigger, shrink it first.
22865 if (Sign.getSimpleValueType().bitsGT(VT))
22866 Sign =
22867 DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22868
22869 // At this point the operands and the result should have the same
22870 // type, and that won't be f80 since that is not custom lowered.
22871 bool IsF128 = (VT == MVT::f128);
22872 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22873 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22874 "Unexpected type in LowerFCOPYSIGN");
22875
22876 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22877
22878 // Perform all scalar logic operations as 16-byte vectors because there are no
22879 // scalar FP logic instructions in SSE.
22880 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22881 // unnecessary splats, but we might miss load folding opportunities. Should
22882 // this decision be based on OptimizeForSize?
22883 bool IsFakeVector = !VT.isVector() && !IsF128;
22884 MVT LogicVT = VT;
22885 if (IsFakeVector)
22886 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22887 : (VT == MVT::f32) ? MVT::v4f32
22888 : MVT::v8f16;
22889
22890 // The mask constants are automatically splatted for vector types.
22891 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22892 SDValue SignMask = DAG.getConstantFP(
22893 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22894 SDValue MagMask = DAG.getConstantFP(
22895 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22896
22897 // First, clear all bits but the sign bit from the second operand (sign).
22898 if (IsFakeVector)
22899 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22900 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22901
22902 // Next, clear the sign bit from the first operand (magnitude).
22903 // TODO: If we had general constant folding for FP logic ops, this check
22904 // wouldn't be necessary.
22905 SDValue MagBits;
22906 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22907 APFloat APF = Op0CN->getValueAPF();
22908 APF.clearSign();
22909 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22910 } else {
22911 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22912 if (IsFakeVector)
22913 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22914 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22915 }
22916
22917 // OR the magnitude value with the sign bit.
22918 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22919 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22920 DAG.getIntPtrConstant(0, dl));
22921}
22922
22923static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22924 SDValue N0 = Op.getOperand(0);
22925 SDLoc dl(Op);
22926 MVT VT = Op.getSimpleValueType();
22927
22928 MVT OpVT = N0.getSimpleValueType();
22929 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22930 "Unexpected type for FGETSIGN");
22931
22932 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22933 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22934 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22935 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22936 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22937 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22938 return Res;
22939}
22940
22941/// Helper for attempting to create a X86ISD::BT node.
22942static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22943 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22944 // instruction. Since the shift amount is in-range-or-undefined, we know
22945 // that doing a bittest on the i32 value is ok. We extend to i32 because
22946 // the encoding for the i16 version is larger than the i32 version.
22947 // Also promote i16 to i32 for performance / code size reason.
22948 if (Src.getValueType().getScalarSizeInBits() < 32)
22949 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22950
22951 // No legal type found, give up.
22952 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22953 return SDValue();
22954
22955 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22956 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22957 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22958 // known to be zero.
22959 if (Src.getValueType() == MVT::i64 &&
22960 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22961 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22962
22963 // If the operand types disagree, extend the shift amount to match. Since
22964 // BT ignores high bits (like shifts) we can use anyextend.
22965 if (Src.getValueType() != BitNo.getValueType())
22966 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22967
22968 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22969}
22970
22971/// Helper for creating a X86ISD::SETCC node.
22972static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22973 SelectionDAG &DAG) {
22974 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22975 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22976}
22977
22978/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22979/// style scalarized (associative) reduction patterns. Partial reductions
22980/// are supported when the pointer SrcMask is non-null.
22981/// TODO - move this to SelectionDAG?
22982static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22983 SmallVectorImpl<SDValue> &SrcOps,
22984 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22985 SmallVector<SDValue, 8> Opnds;
22986 DenseMap<SDValue, APInt> SrcOpMap;
22987 EVT VT = MVT::Other;
22988
22989 // Recognize a special case where a vector is cast into a wide integer to
22990 // test for all 0s.
22991 assert(Op.getOpcode() == unsigned(BinOp) &&
22992 "Unexpected bit reduction opcode");
22993 Opnds.push_back(Op.getOperand(0));
22994 Opnds.push_back(Op.getOperand(1));
22995
22996 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22997 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22998 // BFS traverse all BinOp operands.
22999 if (I->getOpcode() == unsigned(BinOp)) {
23000 Opnds.push_back(I->getOperand(0));
23001 Opnds.push_back(I->getOperand(1));
23002 // Re-evaluate the number of nodes to be traversed.
23003 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23004 continue;
23005 }
23006
23007 // Quit if not an EXTRACT_VECTOR_ELT.
23008 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23009 return false;
23010
23011 // Quit if the index is not a constant.
23012 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23013 if (!Idx)
23014 return false;
23015
23016 SDValue Src = I->getOperand(0);
23017 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23018 if (M == SrcOpMap.end()) {
23019 VT = Src.getValueType();
23020 // Quit if not the same type.
23021 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23022 return false;
23023 unsigned NumElts = VT.getVectorNumElements();
23024 APInt EltCount = APInt::getZero(NumElts);
23025 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23026 SrcOps.push_back(Src);
23027 }
23028
23029 // Quit if element already used.
23030 unsigned CIdx = Idx->getZExtValue();
23031 if (M->second[CIdx])
23032 return false;
23033 M->second.setBit(CIdx);
23034 }
23035
23036 if (SrcMask) {
23037 // Collect the source partial masks.
23038 for (SDValue &SrcOp : SrcOps)
23039 SrcMask->push_back(SrcOpMap[SrcOp]);
23040 } else {
23041 // Quit if not all elements are used.
23042 for (const auto &I : SrcOpMap)
23043 if (!I.second.isAllOnes())
23044 return false;
23045 }
23046
23047 return true;
23048}
23049
23050// Helper function for comparing all bits of a vector against zero.
23051static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
23052 const APInt &Mask,
23053 const X86Subtarget &Subtarget,
23054 SelectionDAG &DAG, X86::CondCode &X86CC) {
23055 EVT VT = V.getValueType();
23056 unsigned ScalarSize = VT.getScalarSizeInBits();
23057 if (Mask.getBitWidth() != ScalarSize) {
23058 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23059 return SDValue();
23060 }
23061
23062 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23063 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23064
23065 auto MaskBits = [&](SDValue Src) {
23066 if (Mask.isAllOnes())
23067 return Src;
23068 EVT SrcVT = Src.getValueType();
23069 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23070 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23071 };
23072
23073 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23074 if (VT.getSizeInBits() < 128) {
23075 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23076 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
23077 return SDValue();
23078 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23079 DAG.getBitcast(IntVT, MaskBits(V)),
23080 DAG.getConstant(0, DL, IntVT));
23081 }
23082
23083 // Quit if not splittable to 128/256-bit vector.
23084 if (!isPowerOf2_32(VT.getSizeInBits()))
23085 return SDValue();
23086
23087 // Split down to 128/256-bit vector.
23088 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
23089 while (VT.getSizeInBits() > TestSize) {
23090 auto Split = DAG.SplitVector(V, DL);
23091 VT = Split.first.getValueType();
23092 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23093 }
23094
23095 bool UsePTEST = Subtarget.hasSSE41();
23096 if (UsePTEST) {
23097 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
23098 V = DAG.getBitcast(TestVT, MaskBits(V));
23099 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23100 }
23101
23102 // Without PTEST, a masked v2i64 or-reduction is not faster than
23103 // scalarization.
23104 if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
23105 return SDValue();
23106
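// MOVMSK fallback: compare each byte against zero with PCMPEQ (0xFF per
// zero byte), gather the 16 byte results with MOVMSK, and compare against
// 0xFFFF, i.e. every byte was zero.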
23107 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
23108 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
23109 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
23110 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23111 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23112 DAG.getConstant(0xFFFF, DL, MVT::i32));
23113}
23114
23115// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
23116// CMP(MOVMSK(PCMPEQB(X,0))).
23117static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
23118 const SDLoc &DL,
23119 const X86Subtarget &Subtarget,
23120 SelectionDAG &DAG, SDValue &X86CC) {
23121 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23122
23123 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23124 return SDValue();
23125
23126 // Check whether we're masking/truncating an OR-reduction result, in which
23127 // case track the masked bits.
23128 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23129 switch (Op.getOpcode()) {
23130 case ISD::TRUNCATE: {
23131 SDValue Src = Op.getOperand(0);
23132 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23133 Op.getScalarValueSizeInBits());
23134 Op = Src;
23135 break;
23136 }
23137 case ISD::AND: {
23138 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23139 Mask = Cst->getAPIntValue();
23140 Op = Op.getOperand(0);
23141 }
23142 break;
23143 }
23144 }
23145
23146 SmallVector<SDValue, 8> VecIns;
23147 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
23148 EVT VT = VecIns[0].getValueType();
23149 assert(llvm::all_of(VecIns,
23150 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23151 "Reduction source vector mismatch");
23152
23153 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
23154 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
23155 return SDValue();
23156
23157 // If more than one full vector is evaluated, OR them first before PTEST.
23158 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23159 Slot += 2, e += 1) {
23160 // Each iteration will OR 2 nodes and append the result until there is
23161 // only 1 node left, i.e. the final OR'd value of all vectors.
23162 SDValue LHS = VecIns[Slot];
23163 SDValue RHS = VecIns[Slot + 1];
23164 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
23165 }
23166
23167 X86::CondCode CCode;
23168 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
23169 DAG, CCode)) {
23170 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23171 return V;
23172 }
23173 }
23174
23175 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23176 ISD::NodeType BinOp;
23177 if (SDValue Match =
23178 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
23179 X86::CondCode CCode;
23180 if (SDValue V =
23181 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
23182 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23183 return V;
23184 }
23185 }
23186 }
23187
23188 return SDValue();
23189}
23190
23191 /// Return true if \c Op has a use that doesn't just read flags.
23192static bool hasNonFlagsUse(SDValue Op) {
23193 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
23194 ++UI) {
23195 SDNode *User = *UI;
23196 unsigned UOpNo = UI.getOperandNo();
23197 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23198 // Look past the truncate.
23199 UOpNo = User->use_begin().getOperandNo();
23200 User = *User->use_begin();
23201 }
23202
23203 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23204 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23205 return true;
23206 }
23207 return false;
23208}
23209
23210// Transform to an x86-specific ALU node with flags if there is a chance of
23211// using an RMW op or only the flags are used. Otherwise, leave
23212// the node alone and emit a 'cmp' or 'test' instruction.
23213static bool isProfitableToUseFlagOp(SDValue Op) {
23214 for (SDNode *U : Op->uses())
23215 if (U->getOpcode() != ISD::CopyToReg &&
23216 U->getOpcode() != ISD::SETCC &&
23217 U->getOpcode() != ISD::STORE)
23218 return false;
23219
23220 return true;
23221}
23222
23223/// Emit nodes that will be selected as "test Op0,Op0", or something
23224/// equivalent.
23225static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23226 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23227 // CF and OF aren't always set the way we want. Determine which
23228 // of these we need.
23229 bool NeedCF = false;
23230 bool NeedOF = false;
23231 switch (X86CC) {
23232 default: break;
23233 case X86::COND_A: case X86::COND_AE:
23234 case X86::COND_B: case X86::COND_BE:
23235 NeedCF = true;
23236 break;
23237 case X86::COND_G: case X86::COND_GE:
23238 case X86::COND_L: case X86::COND_LE:
23239 case X86::COND_O: case X86::COND_NO: {
23240 // Check if we really need to set the
23241 // Overflow flag. If NoSignedWrap is present
23242 // that is not actually needed.
23243 switch (Op->getOpcode()) {
23244 case ISD::ADD:
23245 case ISD::SUB:
23246 case ISD::MUL:
23247 case ISD::SHL:
23248 if (Op.getNode()->getFlags().hasNoSignedWrap())
23249 break;
23250 LLVM_FALLTHROUGH;
23251 default:
23252 NeedOF = true;
23253 break;
23254 }
23255 break;
23256 }
23257 }
23258 // See if we can use the EFLAGS value from the operand instead of
23259 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23260 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23261 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23262 // Emit a CMP with 0, which is the TEST pattern.
23263 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23264 DAG.getConstant(0, dl, Op.getValueType()));
23265 }
23266 unsigned Opcode = 0;
23267 unsigned NumOperands = 0;
23268
23269 SDValue ArithOp = Op;
23270
23271 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23272   // which may be the result of a CAST. We use the variable 'Op', the
23273   // non-casted value, when we check for possible users.
23274 switch (ArithOp.getOpcode()) {
23275 case ISD::AND:
23276 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23277 // because a TEST instruction will be better.
23278 if (!hasNonFlagsUse(Op))
23279 break;
23280
23281     LLVM_FALLTHROUGH;
23282 case ISD::ADD:
23283 case ISD::SUB:
23284 case ISD::OR:
23285 case ISD::XOR:
23286 if (!isProfitableToUseFlagOp(Op))
23287 break;
23288
23289 // Otherwise use a regular EFLAGS-setting instruction.
23290 switch (ArithOp.getOpcode()) {
23291     default: llvm_unreachable("unexpected operator!");
23292 case ISD::ADD: Opcode = X86ISD::ADD; break;
23293 case ISD::SUB: Opcode = X86ISD::SUB; break;
23294 case ISD::XOR: Opcode = X86ISD::XOR; break;
23295 case ISD::AND: Opcode = X86ISD::AND; break;
23296 case ISD::OR: Opcode = X86ISD::OR; break;
23297 }
23298
23299 NumOperands = 2;
23300 break;
23301 case X86ISD::ADD:
23302 case X86ISD::SUB:
23303 case X86ISD::OR:
23304 case X86ISD::XOR:
23305 case X86ISD::AND:
23306 return SDValue(Op.getNode(), 1);
23307 case ISD::SSUBO:
23308 case ISD::USUBO: {
23309     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23310 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23311 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23312 Op->getOperand(1)).getValue(1);
23313 }
23314 default:
23315 break;
23316 }
23317
23318 if (Opcode == 0) {
23319 // Emit a CMP with 0, which is the TEST pattern.
23320 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23321 DAG.getConstant(0, dl, Op.getValueType()));
23322 }
23323 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23324 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23325
23326 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23327 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23328 return SDValue(New.getNode(), 1);
23329}
23330
23331/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23332/// equivalent.
23333static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23334 const SDLoc &dl, SelectionDAG &DAG,
23335 const X86Subtarget &Subtarget) {
23336 if (isNullConstant(Op1))
23337 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23338
23339 EVT CmpVT = Op0.getValueType();
23340
23341   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23342           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23343
23344   // Only promote the compare up to i32 if it is a 16-bit operation
23345   // with an immediate; 16-bit immediates are to be avoided.
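        // Illustration: 'cmp ax, 1000' needs a 16-bit immediate plus the 0x66
        // operand-size prefix, a length-changing-prefix pattern that can stall
        // the decoders on many Intel cores; after the promotion below it
        // becomes 'cmp eax, 1000' and avoids that.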
23346 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
23347 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23348 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
23349 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
23350 // Don't do this if the immediate can fit in 8-bits.
23351 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23352 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23353 unsigned ExtendOp =
23354 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23355 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23356 // For equality comparisons try to use SIGN_EXTEND if the input was
23357           // truncated from something with enough sign bits.
23358 if (Op0.getOpcode() == ISD::TRUNCATE) {
23359 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23360 ExtendOp = ISD::SIGN_EXTEND;
23361 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23362 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23363 ExtendOp = ISD::SIGN_EXTEND;
23364 }
23365 }
23366
23367 CmpVT = MVT::i32;
23368 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23369 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23370 }
23371 }
23372
23373 // Try to shrink i64 compares if the input has enough zero bits.
23374 // FIXME: Do this for non-constant compares for constant on LHS?
23375 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
23376 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23377 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
23378 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23379 CmpVT = MVT::i32;
23380 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23381 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23382 }
23383
23384 // 0-x == y --> x+y == 0
23385 // 0-x != y --> x+y != 0
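        // e.g. '(0 - x) == y' would otherwise need a NEG followed by a CMP;
        // folding it into one flag-producing ADD lets the ZF of 'x + y' answer
        // the equality directly.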
23386 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23387 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23388 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23389 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23390 return Add.getValue(1);
23391 }
23392
23393 // x == 0-y --> x+y == 0
23394 // x != 0-y --> x+y != 0
23395 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23396 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23397 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23398 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23399 return Add.getValue(1);
23400 }
23401
23402 // Use SUB instead of CMP to enable CSE between SUB and CMP.
23403 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23404 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23405 return Sub.getValue(1);
23406}
23407
23408/// Check if replacement of SQRT with RSQRT should be disabled.
23409bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23410 EVT VT = Op.getValueType();
23411
23412 // We don't need to replace SQRT with RSQRT for half type.
23413 if (VT.getScalarType() == MVT::f16)
23414 return true;
23415
23416 // We never want to use both SQRT and RSQRT instructions for the same input.
23417 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23418 return false;
23419
23420 if (VT.isVector())
23421 return Subtarget.hasFastVectorFSQRT();
23422 return Subtarget.hasFastScalarFSQRT();
23423}
23424
23425/// The minimum architected relative accuracy is 2^-12. We need one
23426/// Newton-Raphson step to have a good float result (24 bits of precision).
23427SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23428 SelectionDAG &DAG, int Enabled,
23429 int &RefinementSteps,
23430 bool &UseOneConstNR,
23431 bool Reciprocal) const {
23432 SDLoc DL(Op);
23433 EVT VT = Op.getValueType();
23434
23435 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23436 // It is likely not profitable to do this for f64 because a double-precision
23437 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23438 // instructions: convert to single, rsqrtss, convert back to double, refine
23439 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23440 // along with FMA, this could be a throughput win.
23441 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23442 // after legalize types.
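        // The refinement applied later (outside this function) is the standard
        // Newton-Raphson step for 1/sqrt(x): Est' = Est * (1.5 - 0.5 * x * Est * Est),
        // which roughly doubles the ~12 accurate bits of the hardware estimate.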
23443 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23444 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23445 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23446 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23447 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23448 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23449 RefinementSteps = 1;
23450
23451 UseOneConstNR = false;
23452 // There is no FSQRT for 512-bits, but there is RSQRT14.
23453 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23454 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23455 if (RefinementSteps == 0 && !Reciprocal)
23456 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23457 return Estimate;
23458 }
23459
23460 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23461 Subtarget.hasFP16()) {
23462     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23463 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23464 RefinementSteps = 0;
23465
23466 if (VT == MVT::f16) {
23467 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23468 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23469 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23470 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23471 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23472 }
23473
23474 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23475 }
23476 return SDValue();
23477}
23478
23479/// The minimum architected relative accuracy is 2^-12. We need one
23480/// Newton-Raphson step to have a good float result (24 bits of precision).
23481SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23482 int Enabled,
23483 int &RefinementSteps) const {
23484 SDLoc DL(Op);
23485 EVT VT = Op.getValueType();
23486
23487 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23488 // It is likely not profitable to do this for f64 because a double-precision
23489 // reciprocal estimate with refinement on x86 prior to FMA requires
23490 // 15 instructions: convert to single, rcpss, convert back to double, refine
23491 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23492 // along with FMA, this could be a throughput win.
23493
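        // The refinement applied later (outside this function) is the standard
        // Newton-Raphson step for 1/x: Est' = Est * (2.0 - x * Est), again
        // roughly doubling the ~12 accurate bits of the hardware estimate.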
23494 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23495 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23496 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23497 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23498 // Enable estimate codegen with 1 refinement step for vector division.
23499 // Scalar division estimates are disabled because they break too much
23500 // real-world code. These defaults are intended to match GCC behavior.
23501 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23502 return SDValue();
23503
23504 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23505 RefinementSteps = 1;
23506
23507 // There is no FSQRT for 512-bits, but there is RCP14.
23508 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23509 return DAG.getNode(Opcode, DL, VT, Op);
23510 }
23511
23512 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23513 Subtarget.hasFP16()) {
23514 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23515 RefinementSteps = 0;
23516
23517 if (VT == MVT::f16) {
23518 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23519 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23520 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23521 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23522 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23523 }
23524
23525 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23526 }
23527 return SDValue();
23528}
23529
23530/// If we have at least two divisions that use the same divisor, convert to
23531/// multiplication by a reciprocal. This may need to be adjusted for a given
23532/// CPU if a division's cost is not at least twice the cost of a multiplication.
23533/// This is because we still need one division to calculate the reciprocal and
23534/// then we need two multiplies by that reciprocal as replacements for the
23535/// original divisions.
23536unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23537 return 2;
23538}
23539
23540SDValue
23541X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23542 SelectionDAG &DAG,
23543 SmallVectorImpl<SDNode *> &Created) const {
23544 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23545 if (isIntDivCheap(N->getValueType(0), Attr))
23546 return SDValue(N,0); // Lower SDIV as SDIV
23547
23548   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23549          "Unexpected divisor!");
23550
23551   // Only perform this transform if CMOV is supported, otherwise the select
23552 // below will become a branch.
23553 if (!Subtarget.canUseCMOV())
23554 return SDValue();
23555
23556 // fold (sdiv X, pow2)
23557 EVT VT = N->getValueType(0);
23558 // FIXME: Support i8.
23559 if (VT != MVT::i16 && VT != MVT::i32 &&
23560 !(Subtarget.is64Bit() && VT == MVT::i64))
23561 return SDValue();
23562
23563 unsigned Lg2 = Divisor.countTrailingZeros();
23564
23565 // If the divisor is 2 or -2, the default expansion is better.
23566 if (Lg2 == 1)
23567 return SDValue();
23568
23569 SDLoc DL(N);
23570 SDValue N0 = N->getOperand(0);
23571 SDValue Zero = DAG.getConstant(0, DL, VT);
23572 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
23573 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
23574
23575 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
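        // Worked example with VT = i32, divisor = 4 (Lg2 = 2) and N0 = -7:
        //   N0 < 0, so select N0 + 3 = -4; -4 >> 2 (arithmetic) = -1 == -7 / 4.
        // For N0 = 7 the select keeps 7, and 7 >> 2 = 1 == 7 / 4.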
23576 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
23577 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
23578 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
23579
23580 Created.push_back(Cmp.getNode());
23581 Created.push_back(Add.getNode());
23582 Created.push_back(CMov.getNode());
23583
23584 // Divide by pow2.
23585 SDValue SRA =
23586 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
23587
23588 // If we're dividing by a positive value, we're done. Otherwise, we must
23589 // negate the result.
23590 if (Divisor.isNonNegative())
23591 return SRA;
23592
23593 Created.push_back(SRA.getNode());
23594 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
23595}
23596
23597/// Result of 'and' is compared against zero. Change to a BT node if possible.
23598/// Returns the BT node and the condition code needed to use it.
23599static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23600 SelectionDAG &DAG, X86::CondCode &X86CC) {
23601   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23602 SDValue Op0 = And.getOperand(0);
23603 SDValue Op1 = And.getOperand(1);
23604 if (Op0.getOpcode() == ISD::TRUNCATE)
23605 Op0 = Op0.getOperand(0);
23606 if (Op1.getOpcode() == ISD::TRUNCATE)
23607 Op1 = Op1.getOperand(0);
23608
23609 SDValue Src, BitNo;
23610 if (Op1.getOpcode() == ISD::SHL)
23611 std::swap(Op0, Op1);
23612 if (Op0.getOpcode() == ISD::SHL) {
23613 if (isOneConstant(Op0.getOperand(0))) {
23614 // If we looked past a truncate, check that it's only truncating away
23615 // known zeros.
23616 unsigned BitWidth = Op0.getValueSizeInBits();
23617 unsigned AndBitWidth = And.getValueSizeInBits();
23618 if (BitWidth > AndBitWidth) {
23619 KnownBits Known = DAG.computeKnownBits(Op0);
23620 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23621 return SDValue();
23622 }
23623 Src = Op1;
23624 BitNo = Op0.getOperand(1);
23625 }
23626 } else if (Op1.getOpcode() == ISD::Constant) {
23627 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23628 uint64_t AndRHSVal = AndRHS->getZExtValue();
23629 SDValue AndLHS = Op0;
23630
23631 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23632 Src = AndLHS.getOperand(0);
23633 BitNo = AndLHS.getOperand(1);
23634 } else {
23635 // Use BT if the immediate can't be encoded in a TEST instruction or we
23636       // are optimizing for size and the immediate won't fit in a byte.
23637 bool OptForSize = DAG.shouldOptForSize();
23638 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23639 isPowerOf2_64(AndRHSVal)) {
23640 Src = AndLHS;
23641 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23642 Src.getValueType());
23643 }
23644 }
23645 }
23646
23647 // No patterns found, give up.
23648 if (!Src.getNode())
23649 return SDValue();
23650
23651 // Remove any bit flip.
23652 if (isBitwiseNot(Src)) {
23653 Src = Src.getOperand(0);
23654 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23655 }
23656
23657 // Attempt to create the X86ISD::BT node.
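        // BT copies bit 'BitNo' of Src into CF, so '(X & (1 << N)) == 0' maps
        // to CF == 0 (COND_AE) and '!= 0' maps to CF == 1 (COND_B).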
23658 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23659 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23660 return BT;
23661 }
23662
23663 return SDValue();
23664}
23665
23666// Check if pre-AVX condcode can be performed by a single FCMP op.
23667static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23668 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23669}
23670
23671/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23672/// CMPs.
23673static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23674 SDValue &Op1, bool &IsAlwaysSignaling) {
23675 unsigned SSECC;
23676 bool Swap = false;
23677
23678 // SSE Condition code mapping:
23679 // 0 - EQ
23680 // 1 - LT
23681 // 2 - LE
23682 // 3 - UNORD
23683 // 4 - NEQ
23684 // 5 - NLT
23685 // 6 - NLE
23686 // 7 - ORD
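        // Values 8 (EQ_UQ) and 12 (NEQ_OQ) below only exist in the extended
        // AVX VCMP encoding; pre-AVX CMPPS/CMPPD accept predicates 0-7 only,
        // which is why cheapX86FSETCC_SSE() rejects SETUEQ/SETONE.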
23687 switch (SetCCOpcode) {
23688   default: llvm_unreachable("Unexpected SETCC condition");
23689 case ISD::SETOEQ:
23690 case ISD::SETEQ: SSECC = 0; break;
23691 case ISD::SETOGT:
23692   case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
23693 case ISD::SETLT:
23694 case ISD::SETOLT: SSECC = 1; break;
23695 case ISD::SETOGE:
23696   case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
23697 case ISD::SETLE:
23698 case ISD::SETOLE: SSECC = 2; break;
23699 case ISD::SETUO: SSECC = 3; break;
23700 case ISD::SETUNE:
23701 case ISD::SETNE: SSECC = 4; break;
23702   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
23703 case ISD::SETUGE: SSECC = 5; break;
23704   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
23705 case ISD::SETUGT: SSECC = 6; break;
23706 case ISD::SETO: SSECC = 7; break;
23707 case ISD::SETUEQ: SSECC = 8; break;
23708 case ISD::SETONE: SSECC = 12; break;
23709 }
23710 if (Swap)
23711 std::swap(Op0, Op1);
23712
23713 switch (SetCCOpcode) {
23714 default:
23715 IsAlwaysSignaling = true;
23716 break;
23717 case ISD::SETEQ:
23718 case ISD::SETOEQ:
23719 case ISD::SETUEQ:
23720 case ISD::SETNE:
23721 case ISD::SETONE:
23722 case ISD::SETUNE:
23723 case ISD::SETO:
23724 case ISD::SETUO:
23725 IsAlwaysSignaling = false;
23726 break;
23727 }
23728
23729 return SSECC;
23730}
23731
23732/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23733/// concatenate the result back.
23734static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23735 ISD::CondCode Cond, SelectionDAG &DAG,
23736 const SDLoc &dl) {
23737   assert(VT.isInteger() && VT == LHS.getValueType() &&
23738          VT == RHS.getValueType() && "Unsupported VTs!");
23739
23740 SDValue CC = DAG.getCondCode(Cond);
23741
23742 // Extract the LHS Lo/Hi vectors
23743 SDValue LHS1, LHS2;
23744 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23745
23746 // Extract the RHS Lo/Hi vectors
23747 SDValue RHS1, RHS2;
23748 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23749
23750 // Issue the operation on the smaller types and concatenate the result back
23751 EVT LoVT, HiVT;
23752 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23753 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23754 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23755 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23756}
23757
23758static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23759
23760 SDValue Op0 = Op.getOperand(0);
23761 SDValue Op1 = Op.getOperand(1);
23762 SDValue CC = Op.getOperand(2);
23763 MVT VT = Op.getSimpleValueType();
23764 SDLoc dl(Op);
23765
23766   assert(VT.getVectorElementType() == MVT::i1 &&
23767          "Cannot set masked compare for this operation");
23768
23769 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23770
23771 // Prefer SETGT over SETLT.
23772 if (SetCCOpcode == ISD::SETLT) {
23773 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23774 std::swap(Op0, Op1);
23775 }
23776
23777 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23778}
23779
23780/// Given a buildvector constant, return a new vector constant with each element
23781/// incremented or decremented. If incrementing or decrementing would result in
23782/// unsigned overflow or underflow or this is not a simple vector constant,
23783/// return an empty value.
23784static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
23785 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23786 if (!BV)
23787 return SDValue();
23788
23789 MVT VT = V.getSimpleValueType();
23790 MVT EltVT = VT.getVectorElementType();
23791 unsigned NumElts = VT.getVectorNumElements();
23792 SmallVector<SDValue, 8> NewVecC;
23793 SDLoc DL(V);
23794 for (unsigned i = 0; i < NumElts; ++i) {
23795 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23796 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23797 return SDValue();
23798
23799 // Avoid overflow/underflow.
23800 const APInt &EltC = Elt->getAPIntValue();
23801 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23802 return SDValue();
23803
23804 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23805 }
23806
23807 return DAG.getBuildVector(VT, DL, NewVecC);
23808}
23809
23810/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23811/// Op0 u<= Op1:
23812/// t = psubus Op0, Op1
23813/// pcmpeq t, <0..0>
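      /// For instance, with i8 elements Op0 = 5, Op1 = 9: psubus gives 0 and the
      /// pcmpeq reports true (5 u<= 9); with Op0 = 10, Op1 = 9 it gives 1 and
      /// reports false. In general usubsat(a, b) == 0 iff a u<= b.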
23814static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23815 ISD::CondCode Cond, const SDLoc &dl,
23816 const X86Subtarget &Subtarget,
23817 SelectionDAG &DAG) {
23818 if (!Subtarget.hasSSE2())
23819 return SDValue();
23820
23821 MVT VET = VT.getVectorElementType();
23822 if (VET != MVT::i8 && VET != MVT::i16)
23823 return SDValue();
23824
23825 switch (Cond) {
23826 default:
23827 return SDValue();
23828 case ISD::SETULT: {
23829     // If the comparison is against a constant, we can turn this into a
23830     // setule. With psubus, setule does not require a swap. This is
23831     // beneficial because the constant in the register is no longer
23832     // clobbered as the destination, so it can be hoisted out of a loop.
23833 // Only do this pre-AVX since vpcmp* is no longer destructive.
23834 if (Subtarget.hasAVX())
23835 return SDValue();
23836 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
23837 if (!ULEOp1)
23838 return SDValue();
23839 Op1 = ULEOp1;
23840 break;
23841 }
23842 case ISD::SETUGT: {
23843 // If the comparison is against a constant, we can turn this into a setuge.
23844 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23845 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23846 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23847 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23848 if (!UGEOp1)
23849 return SDValue();
23850 Op1 = Op0;
23851 Op0 = UGEOp1;
23852 break;
23853 }
23854 // Psubus is better than flip-sign because it requires no inversion.
23855 case ISD::SETUGE:
23856 std::swap(Op0, Op1);
23857 break;
23858 case ISD::SETULE:
23859 break;
23860 }
23861
23862 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23863 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23864 DAG.getConstant(0, dl, VT));
23865}
23866
23867static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23868 SelectionDAG &DAG) {
23869 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23870 Op.getOpcode() == ISD::STRICT_FSETCCS;
23871 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23872 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23873 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23874 MVT VT = Op->getSimpleValueType(0);
23875 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23876 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23877 SDLoc dl(Op);
23878
23879 if (isFP) {
23880#ifndef NDEBUG
23881 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23882     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23883#endif
23884
23885 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23886 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23887
23888 // If we have a strict compare with a vXi1 result and the input is 128/256
23889 // bits we can't use a masked compare unless we have VLX. If we use a wider
23890 // compare like we do for non-strict, we might trigger spurious exceptions
23891     // from the upper elements. Instead emit an AVX compare and convert to mask.
23892 unsigned Opc;
23893 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23894 (!IsStrict || Subtarget.hasVLX() ||
23895 Op0.getSimpleValueType().is512BitVector())) {
23896#ifndef NDEBUG
23897 unsigned Num = VT.getVectorNumElements();
23898       assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23899#endif
23900 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23901 } else {
23902 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23903 // The SSE/AVX packed FP comparison nodes are defined with a
23904 // floating-point vector result that matches the operand type. This allows
23905 // them to work with an SSE1 target (integer vector types are not legal).
23906 VT = Op0.getSimpleValueType();
23907 }
23908
23909 SDValue Cmp;
23910 bool IsAlwaysSignaling;
23911 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23912 if (!Subtarget.hasAVX()) {
23913       // TODO: We could use the following steps to handle a quiet compare with
23914       // signaling encodings.
23915       // 1. Get ordered masks from a quiet ISD::SETO
23916       // 2. Use the masks to mask potential unordered elements in operands A and B
23917       // 3. Get the compare results of the masked A and B
23918       // 4. Calculate the final result using the mask and the result from 3
23919 // But currently, we just fall back to scalar operations.
23920 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23921 return SDValue();
23922
23923 // Insert an extra signaling instruction to raise exception.
23924 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23925 SDValue SignalCmp = DAG.getNode(
23926 Opc, dl, {VT, MVT::Other},
23927 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23928 // FIXME: It seems we need to update the flags of all new strict nodes.
23929 // Otherwise, mayRaiseFPException in MI will return false due to
23930 // NoFPExcept = false by default. However, I didn't find it in other
23931 // patches.
23932 SignalCmp->setFlags(Op->getFlags());
23933 Chain = SignalCmp.getValue(1);
23934 }
23935
23936 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23937 // emit two comparisons and a logic op to tie them together.
23938 if (!cheapX86FSETCC_SSE(Cond)) {
23939 // LLVM predicate is SETUEQ or SETONE.
23940 unsigned CC0, CC1;
23941 unsigned CombineOpc;
23942 if (Cond == ISD::SETUEQ) {
23943 CC0 = 3; // UNORD
23944 CC1 = 0; // EQ
23945 CombineOpc = X86ISD::FOR;
23946 } else {
23947         assert(Cond == ISD::SETONE);
23948 CC0 = 7; // ORD
23949 CC1 = 4; // NEQ
23950 CombineOpc = X86ISD::FAND;
23951 }
23952
23953 SDValue Cmp0, Cmp1;
23954 if (IsStrict) {
23955 Cmp0 = DAG.getNode(
23956 Opc, dl, {VT, MVT::Other},
23957 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23958 Cmp1 = DAG.getNode(
23959 Opc, dl, {VT, MVT::Other},
23960 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23961 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23962 Cmp1.getValue(1));
23963 } else {
23964 Cmp0 = DAG.getNode(
23965 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23966 Cmp1 = DAG.getNode(
23967 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23968 }
23969 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23970 } else {
23971 if (IsStrict) {
23972 Cmp = DAG.getNode(
23973 Opc, dl, {VT, MVT::Other},
23974 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23975 Chain = Cmp.getValue(1);
23976 } else
23977 Cmp = DAG.getNode(
23978 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23979 }
23980 } else {
23981 // Handle all other FP comparisons here.
23982 if (IsStrict) {
23983 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23984 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23985 Cmp = DAG.getNode(
23986 Opc, dl, {VT, MVT::Other},
23987 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23988 Chain = Cmp.getValue(1);
23989 } else
23990 Cmp = DAG.getNode(
23991 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23992 }
23993
23994 if (VT.getFixedSizeInBits() >
23995 Op.getSimpleValueType().getFixedSizeInBits()) {
23996 // We emitted a compare with an XMM/YMM result. Finish converting to a
23997 // mask register using a vptestm.
23998 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23999 Cmp = DAG.getBitcast(CastVT, Cmp);
24000 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24001 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24002 } else {
24003 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24004 // the result type of SETCC. The bitcast is expected to be optimized
24005 // away during combining/isel.
24006 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24007 }
24008
24009 if (IsStrict)
24010 return DAG.getMergeValues({Cmp, Chain}, dl);
24011
24012 return Cmp;
24013 }
24014
24015   assert(!IsStrict && "Strict SETCC only handles FP operands.");
24016
24017 MVT VTOp0 = Op0.getSimpleValueType();
24018 (void)VTOp0;
24019   assert(VTOp0 == Op1.getSimpleValueType() &&
24020          "Expected operands with same type!");
24021   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24022          "Invalid number of packed elements for source and destination!");
24023
24024 // The non-AVX512 code below works under the assumption that source and
24025 // destination types are the same.
24026   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24027          "Value types for source and destination must be the same!");
24028
24029 // The result is boolean, but operands are int/float
24030 if (VT.getVectorElementType() == MVT::i1) {
24031     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
24032     // but there is no compare instruction for i8 and i16 elements in KNL.
24033     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24034            "Unexpected operand type");
24035 return LowerIntVSETCC_AVX512(Op, DAG);
24036 }
24037
24038 // Lower using XOP integer comparisons.
24039 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24040 // Translate compare code to XOP PCOM compare mode.
24041 unsigned CmpMode = 0;
24042 switch (Cond) {
24043     default: llvm_unreachable("Unexpected SETCC condition");
24044 case ISD::SETULT:
24045 case ISD::SETLT: CmpMode = 0x00; break;
24046 case ISD::SETULE:
24047 case ISD::SETLE: CmpMode = 0x01; break;
24048 case ISD::SETUGT:
24049 case ISD::SETGT: CmpMode = 0x02; break;
24050 case ISD::SETUGE:
24051 case ISD::SETGE: CmpMode = 0x03; break;
24052 case ISD::SETEQ: CmpMode = 0x04; break;
24053 case ISD::SETNE: CmpMode = 0x05; break;
24054 }
24055
24056 // Are we comparing unsigned or signed integers?
24057 unsigned Opc =
24058 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24059
24060 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24061 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24062 }
24063
24064 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24065 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24066 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24067 SDValue BC0 = peekThroughBitcasts(Op0);
24068 if (BC0.getOpcode() == ISD::AND) {
24069 APInt UndefElts;
24070 SmallVector<APInt, 64> EltBits;
24071 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
24072 VT.getScalarSizeInBits(), UndefElts,
24073 EltBits, false, false)) {
24074 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
24075 Cond = ISD::SETEQ;
24076 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24077 }
24078 }
24079 }
24080 }
24081
24082 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
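        // e.g. for v16i8 X and C = 16 (bit 4): shift left by 8 - 4 - 1 = 3 so
        // bit 4 lands in the sign bit, then arithmetic-shift right by 7 to
        // splat it, giving all-ones when the bit is set and zero otherwise.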
24083 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24084 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24085 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24086 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24087 unsigned BitWidth = VT.getScalarSizeInBits();
24088 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24089
24090 SDValue Result = Op0.getOperand(0);
24091 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24092 DAG.getConstant(ShiftAmt, dl, VT));
24093 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24094 DAG.getConstant(BitWidth - 1, dl, VT));
24095 return Result;
24096 }
24097 }
24098
24099 // Break 256-bit integer vector compare into smaller ones.
24100 if (VT.is256BitVector() && !Subtarget.hasInt256())
24101 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24102
24103 // Break 512-bit integer vector compare into smaller ones.
24104 // TODO: Try harder to use VPCMPx + VPMOV2x?
24105 if (VT.is512BitVector())
24106 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24107
24108 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24109 // not-of-PCMPEQ:
24110 // X != INT_MIN --> X >s INT_MIN
24111 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24112 // +X != 0 --> +X >s 0
24113 APInt ConstValue;
24114 if (Cond == ISD::SETNE &&
24115 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24116 if (ConstValue.isMinSignedValue())
24117 Cond = ISD::SETGT;
24118 else if (ConstValue.isMaxSignedValue())
24119 Cond = ISD::SETLT;
24120 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24121 Cond = ISD::SETGT;
24122 }
24123
24124 // If both operands are known non-negative, then an unsigned compare is the
24125 // same as a signed compare and there's no need to flip signbits.
24126 // TODO: We could check for more general simplifications here since we're
24127 // computing known bits.
24128 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24129 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24130
24131 // Special case: Use min/max operations for unsigned compares.
24132 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24133 if (ISD::isUnsignedIntSetCC(Cond) &&
24134 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24135 TLI.isOperationLegal(ISD::UMIN, VT)) {
24136 // If we have a constant operand, increment/decrement it and change the
24137 // condition to avoid an invert.
24138 if (Cond == ISD::SETUGT) {
24139 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24140 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
24141 Op1 = UGTOp1;
24142 Cond = ISD::SETUGE;
24143 }
24144 }
24145 if (Cond == ISD::SETULT) {
24146 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24147 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
24148 Op1 = ULTOp1;
24149 Cond = ISD::SETULE;
24150 }
24151 }
24152 bool Invert = false;
24153 unsigned Opc;
24154 switch (Cond) {
24155     default: llvm_unreachable("Unexpected condition code");
24156     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
24157 case ISD::SETULE: Opc = ISD::UMIN; break;
24158     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
24159 case ISD::SETUGE: Opc = ISD::UMAX; break;
24160 }
24161
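          // Relies on: a u<= b <=> umin(a, b) == a and a u>= b <=> umax(a, b) == a;
          // the PCMPEQ below compares Op0 against that min/max result.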
24162 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24163 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24164
24165 // If the logical-not of the result is required, perform that now.
24166 if (Invert)
24167 Result = DAG.getNOT(dl, Result, VT);
24168
24169 return Result;
24170 }
24171
24172 // Try to use SUBUS and PCMPEQ.
24173 if (FlipSigns)
24174 if (SDValue V =
24175 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24176 return V;
24177
24178 // We are handling one of the integer comparisons here. Since SSE only has
24179 // GT and EQ comparisons for integer, swapping operands and multiple
24180 // operations may be required for some comparisons.
24181 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24182 : X86ISD::PCMPGT;
24183 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24184 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24185 bool Invert = Cond == ISD::SETNE ||
24186 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24187
24188 if (Swap)
24189 std::swap(Op0, Op1);
24190
24191 // Check that the operation in question is available (most are plain SSE2,
24192 // but PCMPGTQ and PCMPEQQ have different requirements).
24193 if (VT == MVT::v2i64) {
24194 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24195       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24196
24197 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24198 // the odd elements over the even elements.
24199 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24200 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24201 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24202
24203 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24204 static const int MaskHi[] = { 1, 1, 3, 3 };
24205 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24206
24207 return DAG.getBitcast(VT, Result);
24208 }
24209
24210 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24211 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24212 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
24213
24214 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24215 static const int MaskHi[] = { 1, 1, 3, 3 };
24216 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24217
24218 return DAG.getBitcast(VT, Result);
24219 }
24220
24221 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24222 // bits of the inputs before performing those operations. The lower
24223 // compare is always unsigned.
24224 SDValue SB;
24225 if (FlipSigns) {
24226 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
24227 } else {
24228 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
24229 }
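            // The low-dword comparison within a 64-bit compare is unsigned
            // regardless of the predicate, so its sign bit is always flipped;
            // the high dword only needs the flip when the original 64-bit
            // compare itself is unsigned.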
24230 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24231 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24232
24233 // Cast everything to the right type.
24234 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24235 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24236
24237 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24238 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24239 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24240
24241 // Create masks for only the low parts/high parts of the 64 bit integers.
24242 static const int MaskHi[] = { 1, 1, 3, 3 };
24243 static const int MaskLo[] = { 0, 0, 2, 2 };
24244 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24245 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24246 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24247
24248 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24249 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24250
24251 if (Invert)
24252 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24253
24254 return DAG.getBitcast(VT, Result);
24255 }
24256
24257 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24258 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24259 // pcmpeqd + pshufd + pand.
24260       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24261
24262 // First cast everything to the right type.
24263 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24264 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24265
24266 // Do the compare.
24267 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24268
24269 // Make sure the lower and upper halves are both all-ones.
24270 static const int Mask[] = { 1, 0, 3, 2 };
24271 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24272 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24273
24274 if (Invert)
24275 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24276
24277 return DAG.getBitcast(VT, Result);
24278 }
24279 }
24280
24281 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24282 // bits of the inputs before performing those operations.
24283 if (FlipSigns) {
24284 MVT EltVT = VT.getVectorElementType();
24285 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24286 VT);
24287 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24288 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24289 }
24290
24291 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24292
24293 // If the logical-not of the result is required, perform that now.
24294 if (Invert)
24295 Result = DAG.getNOT(dl, Result, VT);
24296
24297 return Result;
24298}
24299
24300// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24301static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24302 const SDLoc &dl, SelectionDAG &DAG,
24303 const X86Subtarget &Subtarget,
24304 SDValue &X86CC) {
24305 // Only support equality comparisons.
24306 if (CC != ISD::SETEQ && CC != ISD::SETNE)
24307 return SDValue();
24308
24309 // Must be a bitcast from vXi1.
24310 if (Op0.getOpcode() != ISD::BITCAST)
24311 return SDValue();
24312
24313 Op0 = Op0.getOperand(0);
24314 MVT VT = Op0.getSimpleValueType();
24315 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24316 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24317 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24318 return SDValue();
24319
24320 X86::CondCode X86Cond;
24321 if (isNullConstant(Op1)) {
24322 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24323 } else if (isAllOnesConstant(Op1)) {
24324 // C flag is set for all ones.
24325 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24326 } else
24327 return SDValue();
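        // KORTEST sets ZF when the OR of its operands is all zeros and CF when
        // it is all ones; KTEST sets ZF when the AND of its operands is all
        // zeros, which is why the all-ones (CF-based) case above is only
        // reached through KORTEST.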
24328
24329   // If the input is an AND, we can combine its operands into the KTEST.
24330 bool KTestable = false;
24331 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24332 KTestable = true;
24333 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24334 KTestable = true;
24335 if (!isNullConstant(Op1))
24336 KTestable = false;
24337 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24338 SDValue LHS = Op0.getOperand(0);
24339 SDValue RHS = Op0.getOperand(1);
24340 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24341 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24342 }
24343
24344   // If the input is an OR, we can combine its operands into the KORTEST.
24345 SDValue LHS = Op0;
24346 SDValue RHS = Op0;
24347 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24348 LHS = Op0.getOperand(0);
24349 RHS = Op0.getOperand(1);
24350 }
24351
24352 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24353 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24354}
24355
24356/// Emit flags for the given setcc condition and operands. Also returns the
24357/// corresponding X86 condition code constant in X86CC.
24358SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24359 ISD::CondCode CC, const SDLoc &dl,
24360 SelectionDAG &DAG,
24361 SDValue &X86CC) const {
24362 // Optimize to BT if possible.
24363 // Lower (X & (1 << N)) == 0 to BT(X, N).
24364 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24365 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24366 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
24367 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24368 X86::CondCode X86CondCode;
24369 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24370 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24371 return BT;
24372 }
24373 }
24374
24375   // Try to use PTEST/PMOVMSKB for a tree of ORs compared for equality with 0.
24376 // TODO: We could do AND tree with all 1s as well by using the C flag.
24377 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
24378 if (SDValue CmpZ =
24379 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
24380 return CmpZ;
24381
24382 // Try to lower using KORTEST or KTEST.
24383 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24384 return Test;
24385
24386 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
24387 // these.
24388 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
24389 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24390 // If the input is a setcc, then reuse the input setcc or use a new one with
24391 // the inverted condition.
24392 if (Op0.getOpcode() == X86ISD::SETCC) {
24393 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24394
24395 X86CC = Op0.getOperand(0);
24396 if (Invert) {
24397 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24398 CCode = X86::GetOppositeBranchCondition(CCode);
24399 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24400 }
24401
24402 return Op0.getOperand(1);
24403 }
24404 }
24405
24406   // Try to use the carry flag from the add in place of a separate CMP for:
24407 // (seteq (add X, -1), -1). Similar for setne.
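        // Adding all-ones (-1) produces a carry-out exactly when X != 0, so
        // X == 0 corresponds to CF == 0 (COND_AE) and X != 0 to CF == 1
        // (COND_B), matching the mapping below.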
24408 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24409 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24410 if (isProfitableToUseFlagOp(Op0)) {
24411 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24412
24413 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24414 Op0.getOperand(1));
24415 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24416 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24417 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24418 return SDValue(New.getNode(), 1);
24419 }
24420 }
24421
24422 X86::CondCode CondCode =
24423 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24424   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24425
24426 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24427 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24428 return EFLAGS;
24429}
24430
24431SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24432
24433 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24434 Op.getOpcode() == ISD::STRICT_FSETCCS;
24435 MVT VT = Op->getSimpleValueType(0);
24436
24437 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24438
24439   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24440 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24441 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24442 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24443 SDLoc dl(Op);
24444 ISD::CondCode CC =
24445 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24446
24447 // Handle f128 first, since one possible outcome is a normal integer
24448 // comparison which gets handled by emitFlagsForSetcc.
24449 if (Op0.getValueType() == MVT::f128) {
24450 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24451 Op.getOpcode() == ISD::STRICT_FSETCCS);
24452
24453 // If softenSetCCOperands returned a scalar, use it.
24454 if (!Op1.getNode()) {
24455       assert(Op0.getValueType() == Op.getValueType() &&
24456              "Unexpected setcc expansion!");
24457 if (IsStrict)
24458 return DAG.getMergeValues({Op0, Chain}, dl);
24459 return Op0;
24460 }
24461 }
24462
24463 if (Op0.getSimpleValueType().isInteger()) {
24464     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
24465     // reduces the number of EFLAGS bits read (the GE conditions don't read ZF);
24466     // this may translate to fewer uops depending on the uarch implementation. The
24467     // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24468     // canonicalize to that CondCode.
24469     // NOTE: Only do this if incrementing the constant doesn't increase the bit
24470     // encoding size - so it must either already be an i8 or i32 immediate, or it
24471     // shrinks down to that. We don't do this for any i64's to avoid additional
24472 // constant materializations.
24473 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24474 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24475 const APInt &Op1Val = Op1C->getAPIntValue();
24476 if (!Op1Val.isZero()) {
24477 // Ensure the constant+1 doesn't overflow.
24478 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24479 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24480 APInt Op1ValPlusOne = Op1Val + 1;
24481 if (Op1ValPlusOne.isSignedIntN(32) &&
24482 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24483 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24484 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24485 : ISD::CondCode::SETUGE;
24486 }
24487 }
24488 }
24489 }
24490
24491 SDValue X86CC;
24492 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24493 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24494 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24495 }
24496
24497 // Handle floating point.
24498 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24499 if (CondCode == X86::COND_INVALID)
24500 return SDValue();
24501
24502 SDValue EFLAGS;
24503 if (IsStrict) {
24504 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24505 EFLAGS =
24506 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24507 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24508 Chain = EFLAGS.getValue(1);
24509 } else {
24510 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24511 }
24512
24513 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24514 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24515 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24516}
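
The constant-adjustment rule in the integer path above (SGT/UGT with a constant becomes SGE/UGE with constant+1, but only when the immediate encoding does not grow) can be modelled in isolation. The sketch below is a simplified scalar version covering only the signed case; the helper name and the int64_t model are hypothetical — the real code works on APInt values inside the DAG.

#include <cassert>
#include <cstdint>

// Returns true and rewrites C in place when "x > C" may become "x >= C+1"
// without growing the immediate encoding (an i8 immediate stays i8, an i32
// immediate stays i32).
static bool canonicalizeSGT(int64_t &C) {
  if (C == 0 || C == INT64_MAX)     // Zero is left to other combines; C+1 must not overflow.
    return false;
  int64_t CPlusOne = C + 1;
  bool fitsI8   = (C >= INT8_MIN && C <= INT8_MAX);
  bool fitsI8P  = (CPlusOne >= INT8_MIN && CPlusOne <= INT8_MAX);
  bool fitsI32P = (CPlusOne >= INT32_MIN && CPlusOne <= INT32_MAX);
  if (!fitsI32P || (fitsI8 && !fitsI8P))
    return false;                   // The rewrite would grow the encoding.
  C = CPlusOne;                     // x > C  ==>  x >= C+1
  return true;
}

int main() {
  int64_t C = 5;
  assert(canonicalizeSGT(C) && C == 6);   // x > 5   ->  x >= 6
  C = 127;                                // i8 max: 128 no longer fits in i8.
  assert(!canonicalizeSGT(C));
}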
24517
24518SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24519 SDValue LHS = Op.getOperand(0);
24520 SDValue RHS = Op.getOperand(1);
24521 SDValue Carry = Op.getOperand(2);
24522 SDValue Cond = Op.getOperand(3);
24523 SDLoc DL(Op);
24524
24525 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24526 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24527
24528 // Recreate the carry if needed.
24529 EVT CarryVT = Carry.getValueType();
24530 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24531 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24532
24533 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24534 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24535 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24536}
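
SETCCCARRY typically appears when a comparison of a wider-than-native integer is legalized into a SUB on the low half and an SBB on the high half; the borrow coming out of the SBB is the comparison result that the code above recovers via X86::COND_B. A minimal standalone sketch of that idea, assuming a two-limb unsigned 128-bit value (names are hypothetical):

#include <cassert>
#include <cstdint>

struct U128 { uint64_t Lo, Hi; };

static bool ult128(U128 A, U128 B) {
  // SUB on the low limb produces a borrow (the x86 carry flag) ...
  uint64_t Borrow = (A.Lo < B.Lo) ? 1 : 0;
  // ... which SBB consumes on the high limb; the borrow coming out of that
  // SBB is the final unsigned "less than" answer (X86::COND_B above).
  return A.Hi < B.Hi || (A.Hi == B.Hi && Borrow);
}

int main() {
  assert(ult128({~0ull, 1}, {0, 2}));   // 0x1FFFF... < 0x20000...
  assert(ult128({5, 7}, {9, 7}));       // high limbs equal, 5 < 9
}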
24537
24538// This function returns three things: the arithmetic computation itself
24539// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24540// flag and the condition code define the case in which the arithmetic
24541// computation overflows.
24542static std::pair<SDValue, SDValue>
24543getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24544 assert(Op.getResNo() == 0 && "Unexpected result number!");
24545 SDValue Value, Overflow;
24546 SDValue LHS = Op.getOperand(0);
24547 SDValue RHS = Op.getOperand(1);
24548 unsigned BaseOp = 0;
24549 SDLoc DL(Op);
24550 switch (Op.getOpcode()) {
24551 default: llvm_unreachable("Unknown ovf instruction!");
24552 case ISD::SADDO:
24553 BaseOp = X86ISD::ADD;
24554 Cond = X86::COND_O;
24555 break;
24556 case ISD::UADDO:
24557 BaseOp = X86ISD::ADD;
24558 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24559 break;
24560 case ISD::SSUBO:
24561 BaseOp = X86ISD::SUB;
24562 Cond = X86::COND_O;
24563 break;
24564 case ISD::USUBO:
24565 BaseOp = X86ISD::SUB;
24566 Cond = X86::COND_B;
24567 break;
24568 case ISD::SMULO:
24569 BaseOp = X86ISD::SMUL;
24570 Cond = X86::COND_O;
24571 break;
24572 case ISD::UMULO:
24573 BaseOp = X86ISD::UMUL;
24574 Cond = X86::COND_O;
24575 break;
24576 }
24577
24578 if (BaseOp) {
24579 // Also sets EFLAGS.
24580 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24581 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24582 Overflow = Value.getValue(1);
24583 }
24584
24585 return std::make_pair(Value, Overflow);
24586}
24587
24588static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24589 // Lower the "add/sub/mul with overflow" instruction into a regular arithmetic instruction plus
24590 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24591 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24592 // has only one use.
24593 SDLoc DL(Op);
24594 X86::CondCode Cond;
24595 SDValue Value, Overflow;
24596 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24597
24598 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24599 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24600 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24601}
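
What the overflow nodes compute can be reproduced with plain scalar code. The sketch below uses the Clang/GCC __builtin_add_overflow builtin as a stand-in for the ADD + SETO/SETB pair produced above; it illustrates the semantics, not the DAG lowering itself.

#include <cassert>
#include <cstdint>

int main() {
  int32_t Sum;
  // ISD::SADDO -> X86ISD::ADD + SETO (signed overflow, X86::COND_O).
  bool SignedOvf = __builtin_add_overflow(INT32_MAX, 1, &Sum);
  assert(SignedOvf);

  uint32_t USum;
  // ISD::UADDO -> X86ISD::ADD + SETB (carry out, X86::COND_B).
  bool Carry = __builtin_add_overflow(UINT32_MAX, 1u, &USum);
  assert(Carry && USum == 0);
}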
24602
24603 /// Return true if the opcode is an X86 logical comparison.
24604static bool isX86LogicalCmp(SDValue Op) {
24605 unsigned Opc = Op.getOpcode();
24606 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24607 Opc == X86ISD::FCMP)
24608 return true;
24609 if (Op.getResNo() == 1 &&
24610 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24611 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24612 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24613 return true;
24614
24615 return false;
24616}
24617
24618static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24619 if (V.getOpcode() != ISD::TRUNCATE)
24620 return false;
24621
24622 SDValue VOp0 = V.getOperand(0);
24623 unsigned InBits = VOp0.getValueSizeInBits();
24624 unsigned Bits = V.getValueSizeInBits();
24625 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24626}
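
A small illustration of why this predicate lets later code look through the truncate: when every bit dropped by the truncate is known to be zero, testing the narrow value against zero is equivalent to testing the wide value. The concrete values below are hypothetical.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Wide = 0x00000040u;        // Upper 24 bits known to be zero.
  uint8_t Narrow = (uint8_t)Wide;     // ISD::TRUNCATE
  // Whenever the dropped bits are zero, (Narrow != 0) and (Wide != 0) agree,
  // so LowerSELECT/LowerBRCOND can emit the TEST on the wider value directly.
  assert((Narrow != 0) == (Wide != 0));
}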
24627
24628SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24629 bool AddTest = true;
24630 SDValue Cond = Op.getOperand(0);
24631 SDValue Op1 = Op.getOperand(1);
24632 SDValue Op2 = Op.getOperand(2);
24633 SDLoc DL(Op);
24634 MVT VT = Op1.getSimpleValueType();
24635 SDValue CC;
24636
24637 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24638 // are available or VBLENDV if AVX is available.
24639 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24640 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24641 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24642 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24643 bool IsAlwaysSignaling;
24644 unsigned SSECC =
24645 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24646 CondOp0, CondOp1, IsAlwaysSignaling);
24647
24648 if (Subtarget.hasAVX512()) {
24649 SDValue Cmp =
24650 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24651 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24652 assert(!VT.isVector() && "Not a scalar type?");
24653 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24654 }
24655
24656 if (SSECC < 8 || Subtarget.hasAVX()) {
24657 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24658 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24659
24660 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24661 // of 3 logic instructions for size savings and potentially speed.
24662 // Unfortunately, there is no scalar form of VBLENDV.
24663
24664 // If either operand is a +0.0 constant, don't try this. We can expect to
24665 // optimize away at least one of the logic instructions later in that
24666 // case, so that sequence would be faster than a variable blend.
24667
24668 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24669 // uses XMM0 as the selection register. That may need just as many
24670 // instructions as the AND/ANDN/OR sequence due to register moves, so
24671 // don't bother.
24672 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24673 !isNullFPConstant(Op2)) {
24674 // Convert to vectors, do a VSELECT, and convert back to scalar.
24675 // All of the conversions should be optimized away.
24676 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24677 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24678 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24679 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24680
24681 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24682 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24683
24684 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24685
24686 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24687 VSel, DAG.getIntPtrConstant(0, DL));
24688 }
24689 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24690 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24691 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24692 }
24693 }
24694
24695 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24696 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24697 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24698 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24699 }
24700
24701 if (Cond.getOpcode() == ISD::SETCC) {
24702 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24703 Cond = NewCond;
24704 // If the condition was updated, it's possible that the operands of the
24705 // select were also updated (for example, EmitTest has a RAUW). Refresh
24706 // the local references to the select operands in case they got stale.
24707 Op1 = Op.getOperand(1);
24708 Op2 = Op.getOperand(2);
24709 }
24710 }
24711
24712 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24713 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24714 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24715 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24716 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24717 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24718 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24719 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24720 if (Cond.getOpcode() == X86ISD::SETCC &&
24721 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24722 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24723 SDValue Cmp = Cond.getOperand(1);
24724 SDValue CmpOp0 = Cmp.getOperand(0);
24725 unsigned CondCode = Cond.getConstantOperandVal(0);
24726
24727 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24728 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24729 // handling to keep the CMP with 0. This should be removed by
24730 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24731 // cttz_zero_undef.
24732 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24733 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24734 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24735 };
24736 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24737 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24738 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24739 // Keep Cmp.
24740 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24741 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24742 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24743 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24744
24745 // 'X - 1' sets the carry flag if X == 0.
24746 // '0 - X' sets the carry flag if X != 0.
24747 // Convert the carry flag to a -1/0 mask with sbb:
24748 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24749 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24750 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24751 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24752 SDValue Sub;
24753 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24754 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24755 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24756 } else {
24757 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24758 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24759 }
24760 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24761 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24762 Sub.getValue(1));
24763 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24764 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24765 Cmp.getOperand(0).getOpcode() == ISD::AND &&
24766 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
24767 SDValue Src1, Src2;
24768 // Returns true if Op2 is an XOR or OR operator and one of its operands
24769 // is equal to Op1, i.e. the pair is
24770 // (a, a op b) or (b, a op b).
24771 auto isOrXorPattern = [&]() {
24772 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24773 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24774 Src1 =
24775 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24776 Src2 = Op1;
24777 return true;
24778 }
24779 return false;
24780 };
24781
24782 if (isOrXorPattern()) {
24783 SDValue Neg;
24784 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24785 // We need a mask of all zeros or all ones with the same size as the other
24786 // operands.
24787 if (CmpSz > VT.getSizeInBits())
24788 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24789 else if (CmpSz < VT.getSizeInBits())
24790 Neg = DAG.getNode(ISD::AND, DL, VT,
24791 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24792 DAG.getConstant(1, DL, VT));
24793 else
24794 Neg = CmpOp0;
24795 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
24796 Neg); // -(and (x, 0x1))
24797 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24798 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24799 }
24800 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24801 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24802 ((CondCode == X86::COND_S) || // smin(x, 0)
24803 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24804 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24805 //
24806 // If the comparison is testing for a positive value, we have to invert
24807 // the sign bit mask, so only do that transform if the target has a
24808 // bitwise 'and not' instruction (the invert is free).
24809 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24810 unsigned ShCt = VT.getSizeInBits() - 1;
24811 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24812 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24813 if (CondCode == X86::COND_G)
24814 Shift = DAG.getNOT(DL, Shift, VT);
24815 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24816 }
24817 }
24818
24819 // Look past (and (setcc_carry (cmp ...)), 1).
24820 if (Cond.getOpcode() == ISD::AND &&
24821 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24822 isOneConstant(Cond.getOperand(1)))
24823 Cond = Cond.getOperand(0);
24824
24825 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24826 // setting operand in place of the X86ISD::SETCC.
24827 unsigned CondOpcode = Cond.getOpcode();
24828 if (CondOpcode == X86ISD::SETCC ||
24829 CondOpcode == X86ISD::SETCC_CARRY) {
24830 CC = Cond.getOperand(0);
24831
24832 SDValue Cmp = Cond.getOperand(1);
24833 bool IllegalFPCMov = false;
24834 if (VT.isFloatingPoint() && !VT.isVector() &&
24835 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24836 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24837
24838 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24839 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24840 Cond = Cmp;
24841 AddTest = false;
24842 }
24843 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24844 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24845 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24846 SDValue Value;
24847 X86::CondCode X86Cond;
24848 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24849
24850 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24851 AddTest = false;
24852 }
24853
24854 if (AddTest) {
24855 // Look past the truncate if the high bits are known zero.
24856 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24857 Cond = Cond.getOperand(0);
24858
24859 // We know the result of AND is compared against zero. Try to match
24860 // it to BT.
24861 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24862 X86::CondCode X86CondCode;
24863 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24864 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24865 Cond = BT;
24866 AddTest = false;
24867 }
24868 }
24869 }
24870
24871 if (AddTest) {
24872 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24873 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24874 }
24875
24876 // a < b ? -1 : 0 -> RES = ~setcc_carry
24877 // a < b ? 0 : -1 -> RES = setcc_carry
24878 // a >= b ? -1 : 0 -> RES = setcc_carry
24879 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24880 if (Cond.getOpcode() == X86ISD::SUB) {
24881 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24882
24883 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24884 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24885 (isNullConstant(Op1) || isNullConstant(Op2))) {
24886 SDValue Res =
24887 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24888 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24889 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24890 return DAG.getNOT(DL, Res, Res.getValueType());
24891 return Res;
24892 }
24893 }
24894
24895 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24896 // widen the cmov and push the truncate through. This avoids introducing a new
24897 // branch during isel and doesn't add any extensions.
24898 if (Op.getValueType() == MVT::i8 &&
24899 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24900 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24901 if (T1.getValueType() == T2.getValueType() &&
24902 // Exclude CopyFromReg to avoid partial register stalls.
24903 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24904 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24905 CC, Cond);
24906 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24907 }
24908 }
24909
24910 // Or finally, promote i8 cmovs if we have CMOV,
24911 // or i16 cmovs if it won't prevent folding a load.
24912 // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
24913 // legal, but EmitLoweredSelect() cannot deal with these extensions
24914 // being inserted between two CMOVs. (in the i16 case too TBN)
24915 // https://bugs.llvm.org/show_bug.cgi?id=40974
24916 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24917 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24918 !X86::mayFoldLoad(Op2, Subtarget))) {
24919 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24920 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24921 SDValue Ops[] = { Op2, Op1, CC, Cond };
24922 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24923 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24924 }
24925
24926 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24927 // condition is true.
24928 SDValue Ops[] = { Op2, Op1, CC, Cond };
24929 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24930}
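
The carry-to-mask rewrite used above for select-of-all-ones ('0 - X' or 'X - 1' followed by SBB, then OR with Y) can be written out in scalar form. A minimal sketch with hypothetical helper names and 32-bit operands:

#include <cassert>
#include <cstdint>

static uint32_t selectNeZeroAllOnes(uint32_t X, uint32_t Y) {
  // '0 - X' borrows (x86 CF = 1) exactly when X != 0 ...
  uint32_t Borrow = (0u < X) ? 1u : 0u;
  // ... and SETCC_CARRY/SBB turns that single bit into an all-ones/zero mask.
  uint32_t Mask = 0u - Borrow;        // -1 if X != 0, else 0
  return Mask | Y;                    // select (X != 0), -1, Y
}

int main() {
  assert(selectNeZeroAllOnes(7, 123) == 0xFFFFFFFFu);
  assert(selectNeZeroAllOnes(0, 123) == 123u);
}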
24931
24932static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24933 const X86Subtarget &Subtarget,
24934 SelectionDAG &DAG) {
24935 MVT VT = Op->getSimpleValueType(0);
24936 SDValue In = Op->getOperand(0);
24937 MVT InVT = In.getSimpleValueType();
24938 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24939 MVT VTElt = VT.getVectorElementType();
24940 SDLoc dl(Op);
24941
24942 unsigned NumElts = VT.getVectorNumElements();
24943
24944 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24945 MVT ExtVT = VT;
24946 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24947 // If v16i32 is to be avoided, we'll need to split and concatenate.
24948 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24949 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24950
24951 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24952 }
24953
24954 // Widen to 512-bits if VLX is not supported.
24955 MVT WideVT = ExtVT;
24956 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24957 NumElts *= 512 / ExtVT.getSizeInBits();
24958 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24959 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24960 In, DAG.getIntPtrConstant(0, dl));
24961 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24962 }
24963
24964 SDValue V;
24965 MVT WideEltVT = WideVT.getVectorElementType();
24966 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24967 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24968 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24969 } else {
24970 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24971 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24972 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24973 }
24974
24975 // Truncate if we had to extend i16/i8 above.
24976 if (VT != ExtVT) {
24977 WideVT = MVT::getVectorVT(VTElt, NumElts);
24978 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24979 }
24980
24981 // Extract back to 128/256-bit if we widened.
24982 if (WideVT != VT)
24983 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24984 DAG.getIntPtrConstant(0, dl));
24985
24986 return V;
24987}
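
Two of the steps above can be shown with plain scalar arithmetic: the scaling of the element count up to a 512-bit vector when VLX is unavailable, and the fallback that materializes each i1 lane as select(b, -1, 0). The numbers and names below are hypothetical.

#include <cassert>
#include <cstdint>

int main() {
  // (1) v4i1 -> v4i32 without VLX: scale the element count up to 512 bits.
  unsigned NumElts = 4, ExtBits = 4 * 32;   // ExtVT = v4i32
  NumElts *= 512 / ExtBits;                 // widen to v16i32
  assert(NumElts == 16);

  // (2) Per-lane sign extension of an i1 mask bit via a select of constants.
  auto SExtLane = [](bool B) { return B ? -1 : 0; };
  assert(SExtLane(true) == -1 && SExtLane(false) == 0);
}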
24988
24989static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24990 SelectionDAG &DAG) {
24991 SDValue In = Op->getOperand(0);
24992 MVT InVT = In.getSimpleValueType();
24993
24994 if (InVT.getVectorElementType() == MVT::i1)
24995 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24996
24997 assert(Subtarget.hasAVX() && "Expected AVX support");
24998 return LowerAVXExtend(Op, DAG, Subtarget);
24999}
25000
25001// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25002// For sign extend this needs to handle all vector sizes and SSE4.1 and
25003// non-SSE4.1 targets. For zero extend this should only handle inputs of
25004// MVT::v64i8 when BWI is not supported, but AVX512 is.
25005static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25006 const X86Subtarget &Subtarget,
25007 SelectionDAG &DAG) {
25008 SDValue In = Op->getOperand(0);
25009 MVT VT = Op->getSimpleValueType(0);
25010 MVT InVT = In.getSimpleValueType();
25011
25012 MVT SVT = VT.getVectorElementType();
25013 MVT InSVT = InVT.getVectorElementType();
25014 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25015
25016 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25017 return SDValue();
25018 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25019 return SDValue();
25020 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25021 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25022 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25023 return SDValue();
25024
25025 SDLoc dl(Op);
25026 unsigned Opc = Op.getOpcode();
25027 unsigned NumElts = VT.getVectorNumElements();
25028
25029 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25030 // For 512-bit vectors, we need 128-bits or 256-bits.
25031 if (InVT.getSizeInBits() > 128) {
25032 // Input needs to be at least the same number of elements as output, and
25033 // at least 128-bits.
25034 int InSize = InSVT.getSizeInBits() * NumElts;
25035 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25036 InVT = In.getSimpleValueType();
25037 }
25038
25039 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25040 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25041 // need to be handled here for 256/512-bit results.
25042 if (Subtarget.hasInt256()) {
25043 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25044
25045 if (InVT.getVectorNumElements() != NumElts)
25046 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25047
25048 // FIXME: Apparently we create inreg operations that could be regular
25049 // extends.
25050 unsigned ExtOpc =
25051 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25052 : ISD::ZERO_EXTEND;
25053 return DAG.getNode(ExtOpc, dl, VT, In);
25054 }
25055
25056 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25057 if (Subtarget.hasAVX()) {
25058 assert(VT.is256BitVector() && "256-bit vector expected");
25059 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25060 int HalfNumElts = HalfVT.getVectorNumElements();
25061
25062 unsigned NumSrcElts = InVT.getVectorNumElements();
25063 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25064 for (int i = 0; i != HalfNumElts; ++i)
25065 HiMask[i] = HalfNumElts + i;
25066
25067 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25068 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25069 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25070 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25071 }
25072
25073 // We should only get here for sign extend.
25074 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25075 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25076
25077 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25078 SDValue Curr = In;
25079 SDValue SignExt = Curr;
25080
25081 // As SRAI is only available on i16/i32 types, we expand only up to i32
25082 // and handle i64 separately.
25083 if (InVT != MVT::v4i32) {
25084 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25085
25086 unsigned DestWidth = DestVT.getScalarSizeInBits();
25087 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25088
25089 unsigned InNumElts = InVT.getVectorNumElements();
25090 unsigned DestElts = DestVT.getVectorNumElements();
25091
25092 // Build a shuffle mask that takes each input element and places it in the
25093 // MSBs of the new element size.
25094 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25095 for (unsigned i = 0; i != DestElts; ++i)
25096 Mask[i * Scale + (Scale - 1)] = i;
25097
25098 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25099 Curr = DAG.getBitcast(DestVT, Curr);
25100
25101 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25102 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25103 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25104 }
25105
25106 if (VT == MVT::v2i64) {
25107 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25108 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25109 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25110 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25111 SignExt = DAG.getBitcast(VT, SignExt);
25112 }
25113
25114 return SignExt;
25115}
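
The pre-SSE4.1 tail of this function relies on a classic trick: shuffle the narrow element into the most significant bits of the wider lane, then use an arithmetic right shift to replicate the sign bit. A scalar sketch of the same trick for i8 -> i32 (it assumes the usual two's-complement shift and conversion behavior, which C++20 guarantees):

#include <cassert>
#include <cstdint>

static int32_t sextI8ToI32(uint8_t B) {
  // Shuffle step: the i8 value ends up in the MSBs of an i32 lane.
  int32_t Shifted = (int32_t)((uint32_t)B << 24);
  // VSRAI step: arithmetic shift right by DestWidth - SrcWidth = 24 bits.
  return Shifted >> 24;
}

int main() {
  assert(sextI8ToI32(0x7F) == 127);
  assert(sextI8ToI32(0x80) == -128);
  assert(sextI8ToI32(0xFF) == -1);
}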
25116
25117static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25118 SelectionDAG &DAG) {
25119 MVT VT = Op->getSimpleValueType(0);
25120 SDValue In = Op->getOperand(0);
25121 MVT InVT = In.getSimpleValueType();
25122 SDLoc dl(Op);
25123
25124 if (InVT.getVectorElementType() == MVT::i1)
25125 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25126
25127 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25128 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25129 "Expected same number of elements");
25130 assert((VT.getVectorElementType() == MVT::i16 ||
25131 VT.getVectorElementType() == MVT::i32 ||
25132 VT.getVectorElementType() == MVT::i64) &&
25133 "Unexpected element type");
25134 assert((InVT.getVectorElementType() == MVT::i8 ||
25135 InVT.getVectorElementType() == MVT::i16 ||
25136 InVT.getVectorElementType() == MVT::i32) &&
25137 "Unexpected element type");
25138
25139 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25140 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25141 return splitVectorIntUnary(Op, DAG);
25142 }
25143
25144 if (Subtarget.hasInt256())
25145 return Op;
25146
25147 // Optimize vectors in AVX mode
25148 // Sign extend v8i16 to v8i32 and
25149 // v4i32 to v4i64
25150 //
25151 // Divide input vector into two parts
25152 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25153 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25154 // concat the vectors to original VT
25155 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25156 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25157
25158 unsigned NumElems = InVT.getVectorNumElements();
25159 SmallVector<int,8> ShufMask(NumElems, -1);
25160 for (unsigned i = 0; i != NumElems/2; ++i)
25161 ShufMask[i] = i + NumElems/2;
25162
25163 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25164 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25165
25166 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25167}
25168
25169/// Change a vector store into a pair of half-size vector stores.
25170static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25171 SDValue StoredVal = Store->getValue();
25172 assert((StoredVal.getValueType().is256BitVector() ||
25173 StoredVal.getValueType().is512BitVector()) &&
25174 "Expecting 256/512-bit op");
25175
25176 // Splitting volatile memory ops is not allowed unless the operation was not
25177 // legal to begin with. Assume the input store is legal (this transform is
25178 // only used for targets with AVX). Note: It is possible that we have an
25179 // illegal type like v2i128, and so we could allow splitting a volatile store
25180 // in that case if that is important.
25181 if (!Store->isSimple())
25182 return SDValue();
25183
25184 SDLoc DL(Store);
25185 SDValue Value0, Value1;
25186 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25187 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25188 SDValue Ptr0 = Store->getBasePtr();
25189 SDValue Ptr1 =
25190 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
25191 SDValue Ch0 =
25192 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25193 Store->getOriginalAlign(),
25194 Store->getMemOperand()->getFlags());
25195 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25196 Store->getPointerInfo().getWithOffset(HalfOffset),
25197 Store->getOriginalAlign(),
25198 Store->getMemOperand()->getFlags());
25199 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25200}
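
The address arithmetic above is simple: the second half is stored at the base pointer plus the store size of the first half. A standalone sketch with memcpy standing in for the two DAG stores (buffer names are hypothetical):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  alignas(32) uint8_t Src[32], Dst[32] = {0};
  for (int i = 0; i != 32; ++i) Src[i] = (uint8_t)i;

  unsigned HalfOffset = 16;                       // Value0's store size in bytes
  std::memcpy(Dst, Src, HalfOffset);              // store of Value0 at Ptr0
  std::memcpy(Dst + HalfOffset, Src + HalfOffset, HalfOffset); // Value1 at Ptr0 + 16

  assert(std::memcmp(Dst, Src, 32) == 0);         // same bytes as one 256-bit store
}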
25201
25202 /// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
25203 /// type.
25204static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25205 SelectionDAG &DAG) {
25206 SDValue StoredVal = Store->getValue();
25207 assert(StoreVT.is128BitVector() &&
25208 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25209 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25210
25211 // Splitting volatile memory ops is not allowed unless the operation was not
25212 // legal to begin with. We are assuming the input op is legal (this transform
25213 // is only used for targets with AVX).
25214 if (!Store->isSimple())
25215 return SDValue();
25216
25217 MVT StoreSVT = StoreVT.getScalarType();
25218 unsigned NumElems = StoreVT.getVectorNumElements();
25219 unsigned ScalarSize = StoreSVT.getStoreSize();
25220
25221 SDLoc DL(Store);
25222 SmallVector<SDValue, 4> Stores;
25223 for (unsigned i = 0; i != NumElems; ++i) {
25224 unsigned Offset = i * ScalarSize;
25225 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25226 TypeSize::Fixed(Offset), DL);
25227 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25228 DAG.getIntPtrConstant(i, DL));
25229 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25230 Store->getPointerInfo().getWithOffset(Offset),
25231 Store->getOriginalAlign(),
25232 Store->getMemOperand()->getFlags());
25233 Stores.push_back(Ch);
25234 }
25235 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25236}
25237
25238static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25239 SelectionDAG &DAG) {
25240 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25241 SDLoc dl(St);
25242 SDValue StoredVal = St->getValue();
25243
25244 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25245 if (StoredVal.getValueType().isVector() &&
25246 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25247 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25248 assert(NumElts <= 8 && "Unexpected VT");
25249 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25250 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25251 "Expected AVX512F without AVX512DQI");
25252
25253 // We must pad with zeros to ensure we store zeroes to any unused bits.
25254 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25255 DAG.getUNDEF(MVT::v16i1), StoredVal,
25256 DAG.getIntPtrConstant(0, dl));
25257 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25258 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25259 // Make sure we store zeros in the extra bits.
25260 if (NumElts < 8)
25261 StoredVal = DAG.getZeroExtendInReg(
25262 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25263
25264 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25265 St->getPointerInfo(), St->getOriginalAlign(),
25266 St->getMemOperand()->getFlags());
25267 }
25268
25269 if (St->isTruncatingStore())
25270 return SDValue();
25271
25272 // If this is a 256-bit store of concatenated ops, we are better off splitting
25273 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25274 // and lets each half execute independently. Some cores would split the op into
25275 // halves anyway, so the concat (vinsertf128) is purely an extra op.
25276 MVT StoreVT = StoredVal.getSimpleValueType();
25277 if (StoreVT.is256BitVector() ||
25278 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
25279 !Subtarget.hasBWI())) {
25280 SmallVector<SDValue, 4> CatOps;
25281 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
25282 return splitVectorStore(St, DAG);
25283 return SDValue();
25284 }
25285
25286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25287 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
25288 "Unexpected VT");
25289 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25290 TargetLowering::TypeWidenVector && "Unexpected type action!");
25291
25292 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25293 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25294 DAG.getUNDEF(StoreVT));
25295
25296 if (Subtarget.hasSSE2()) {
25297 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25298 // and store it.
25299 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25300 MVT CastVT = MVT::getVectorVT(StVT, 2);
25301 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25302 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25303 DAG.getIntPtrConstant(0, dl));
25304
25305 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25306 St->getPointerInfo(), St->getOriginalAlign(),
25307 St->getMemOperand()->getFlags());
25308 }
25309 assert(Subtarget.hasSSE1() && "Expected SSE");
25310 SDVTList Tys = DAG.getVTList(MVT::Other);
25311 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25312 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25313 St->getMemOperand());
25314}
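
For the v2i1/v4i1/v8i1 path at the top of LowerStore, the net effect is to pack the mask bits into a byte and force the unused high bits to zero before storing. A scalar sketch of that packing for a v4i1 mask (the helper name is hypothetical):

#include <cassert>
#include <cstdint>

static uint8_t storeMaskV4I1(bool B0, bool B1, bool B2, bool B3) {
  // Widen + bitcast: the four mask bits land in the low bits of an integer.
  uint8_t Bits = (uint8_t)(B0 | (B1 << 1) | (B2 << 2) | (B3 << 3));
  // getZeroExtendInReg with an i4 type: keep only NumElts = 4 low bits.
  return Bits & 0x0F;
}

int main() {
  assert(storeMaskV4I1(true, false, true, true) == 0b1101);
}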
25315
25316// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25317// may emit an illegal shuffle but the expansion is still better than scalar
25318// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25319 // we'll emit a shuffle and an arithmetic shift.
25320// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25321// TODO: It is possible to support ZExt by zeroing the undef values during
25322// the shuffle phase or after the shuffle.
25323static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25324 SelectionDAG &DAG) {
25325 MVT RegVT = Op.getSimpleValueType();
25326 assert(RegVT.isVector() && "We only custom lower vector loads.");
25327 assert(RegVT.isInteger() &&
25328 "We only custom lower integer vector loads.");
25329
25330 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25331 SDLoc dl(Ld);
25332
25333 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25334 if (RegVT.getVectorElementType() == MVT::i1) {
25335 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25336 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25337 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25338 "Expected AVX512F without AVX512DQI");
25339
25340 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25341 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25342 Ld->getMemOperand()->getFlags());
25343
25344 // Replace chain users with the new chain.
25345 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25346
25347 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25348 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25349 DAG.getBitcast(MVT::v16i1, Val),
25350 DAG.getIntPtrConstant(0, dl));
25351 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25352 }
25353
25354 return SDValue();
25355}
25356
25357/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25358/// each of which has no other use apart from the AND / OR.
25359static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25360 Opc = Op.getOpcode();
25361 if (Opc != ISD::OR && Opc != ISD::AND)
25362 return false;
25363 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25364 Op.getOperand(0).hasOneUse() &&
25365 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25366 Op.getOperand(1).hasOneUse());
25367}
25368
25369SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25370 SDValue Chain = Op.getOperand(0);
25371 SDValue Cond = Op.getOperand(1);
25372 SDValue Dest = Op.getOperand(2);
25373 SDLoc dl(Op);
25374
25375 if (Cond.getOpcode() == ISD::SETCC &&
25376 Cond.getOperand(0).getValueType() != MVT::f128) {
25377 SDValue LHS = Cond.getOperand(0);
25378 SDValue RHS = Cond.getOperand(1);
25379 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25380
25381 // Special case for
25382 // setcc([su]{add,sub,mul}o == 0)
25383 // setcc([su]{add,sub,mul}o != 1)
25384 if (ISD::isOverflowIntrOpRes(LHS) &&
25385 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25386 (isNullConstant(RHS) || isOneConstant(RHS))) {
25387 SDValue Value, Overflow;
25388 X86::CondCode X86Cond;
25389 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25390
25391 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25392 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25393
25394 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25395 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25396 Overflow);
25397 }
25398
25399 if (LHS.getSimpleValueType().isInteger()) {
25400 SDValue CCVal;
25401 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25402 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25403 EFLAGS);
25404 }
25405
25406 if (CC == ISD::SETOEQ) {
25407 // For FCMP_OEQ, we can emit
25408 // two branches instead of an explicit AND instruction with a
25409 // separate test. However, we only do this if this block doesn't
25410 // have a fall-through edge, because this requires an explicit
25411 // jmp when the condition is false.
25412 if (Op.getNode()->hasOneUse()) {
25413 SDNode *User = *Op.getNode()->use_begin();
25414 // Look for an unconditional branch following this conditional branch.
25415 // We need this because we need to reverse the successors in order
25416 // to implement FCMP_OEQ.
25417 if (User->getOpcode() == ISD::BR) {
25418 SDValue FalseBB = User->getOperand(1);
25419 SDNode *NewBR =
25420 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25421 assert(NewBR == User);
25422 (void)NewBR;
25423 Dest = FalseBB;
25424
25425 SDValue Cmp =
25426 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25427 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25428 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25429 CCVal, Cmp);
25430 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25431 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25432 Cmp);
25433 }
25434 }
25435 } else if (CC == ISD::SETUNE) {
25436 // For FCMP_UNE, we can emit
25437 // two branches instead of an explicit OR instruction with a
25438 // separate test.
25439 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25440 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25441 Chain =
25442 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
25443 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25444 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25445 Cmp);
25446 } else {
25447 X86::CondCode X86Cond =
25448 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25449 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25450 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25451 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25452 Cmp);
25453 }
25454 }
25455
25456 if (ISD::isOverflowIntrOpRes(Cond)) {
25457 SDValue Value, Overflow;
25458 X86::CondCode X86Cond;
25459 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25460
25461 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25462 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25463 Overflow);
25464 }
25465
25466 // Look past the truncate if the high bits are known zero.
25467 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25468 Cond = Cond.getOperand(0);
25469
25470 EVT CondVT = Cond.getValueType();
25471
25472 // Add an AND with 1 if we don't already have one.
25473 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25474 Cond =
25475 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25476
25477 SDValue LHS = Cond;
25478 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25479
25480 SDValue CCVal;
25481 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25482 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25483 EFLAGS);
25484}
25485
25486// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25487// Calls to _alloca are needed to probe the stack when allocating more than 4k
25488// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25489// that the guard pages used by the OS virtual memory manager are allocated in
25490// correct sequence.
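// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Windows commits stack pages lazily behind a single guard page, so an
// allocation larger than one page must touch every new page in order. For a
// hypothetical 12K alloca the probing amounts to:
//   for (unsigned Off = 4096; Off <= 12288; Off += 4096)
//     touch(SP - Off); // fault in one guard page at a time
// Jumping straight past the guard page would fault instead of growing the stack.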
25491SDValue
25492X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25493 SelectionDAG &DAG) const {
25494 MachineFunction &MF = DAG.getMachineFunction();
25495 bool SplitStack = MF.shouldSplitStack();
25496 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25497 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25498 SplitStack || EmitStackProbeCall;
25499 SDLoc dl(Op);
25500
25501 // Get the inputs.
25502 SDNode *Node = Op.getNode();
25503 SDValue Chain = Op.getOperand(0);
25504 SDValue Size = Op.getOperand(1);
25505 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25506 EVT VT = Node->getValueType(0);
25507
25508 // Chain the dynamic stack allocation so that it doesn't modify the stack
25509 // pointer when other instructions are using the stack.
25510 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25511
25512 bool Is64Bit = Subtarget.is64Bit();
25513 MVT SPTy = getPointerTy(DAG.getDataLayout());
25514
25515 SDValue Result;
25516 if (!Lower) {
25517 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25518 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25519 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25520 " not tell us which reg is the stack pointer!");
25521
25522 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25523 const Align StackAlign = TFI.getStackAlign();
25524 if (hasInlineStackProbe(MF)) {
25525 MachineRegisterInfo &MRI = MF.getRegInfo();
25526
25527 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25528 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25529 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25530 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
25531 DAG.getRegister(Vreg, SPTy));
25532 } else {
25533 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25534 Chain = SP.getValue(1);
25535 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25536 }
25537 if (Alignment && *Alignment > StackAlign)
25538 Result =
25539 DAG.getNode(ISD::AND, dl, VT, Result,
25540 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25541 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25542 } else if (SplitStack) {
25543 MachineRegisterInfo &MRI = MF.getRegInfo();
25544
25545 if (Is64Bit) {
25546 // The 64 bit implementation of segmented stacks needs to clobber both r10
25547 // and r11. This makes it impossible to use it along with nested parameters.
25548 const Function &F = MF.getFunction();
25549 for (const auto &A : F.args()) {
25550 if (A.hasNestAttr())
25551 report_fatal_error("Cannot use segmented stacks with functions that "
25552 "have nested arguments.");
25553 }
25554 }
25555
25556 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25557 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25558 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25559 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
25560 DAG.getRegister(Vreg, SPTy));
25561 } else {
25562 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25563 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25564 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25565
25566 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25567 Register SPReg = RegInfo->getStackRegister();
25568 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25569 Chain = SP.getValue(1);
25570
25571 if (Alignment) {
25572 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25573 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25574 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25575 }
25576
25577 Result = SP;
25578 }
25579
25580 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
25581 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
25582
25583 SDValue Ops[2] = {Result, Chain};
25584 return DAG.getMergeValues(Ops, dl);
25585}
25586
25587SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25588 MachineFunction &MF = DAG.getMachineFunction();
25589 auto PtrVT = getPointerTy(MF.getDataLayout());
25590 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25591
25592 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25593 SDLoc DL(Op);
25594
25595 if (!Subtarget.is64Bit() ||
25596 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25597 // vastart just stores the address of the VarArgsFrameIndex slot into the
25598 // memory location argument.
25599 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25600 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25601 MachinePointerInfo(SV));
25602 }
25603
25604 // __va_list_tag:
25605 // gp_offset (0 - 6 * 8)
25606 // fp_offset (48 - 48 + 8 * 16)
25607 // overflow_arg_area (point to parameters coming in memory).
25608 // reg_save_area
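// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// The stores below fill a SysV AMD64 __va_list_tag at these byte offsets
// (LP64 shown; the X32 ABI uses 4-byte pointers, giving offsets 8 and 12):
//   struct __va_list_tag {
//     unsigned int gp_offset;       // offset 0
//     unsigned int fp_offset;       // offset 4
//     void *overflow_arg_area;      // offset 8
//     void *reg_save_area;          // offset 16
//   };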
25609 SmallVector<SDValue, 8> MemOps;
25610 SDValue FIN = Op.getOperand(1);
25611 // Store gp_offset
25612 SDValue Store = DAG.getStore(
25613 Op.getOperand(0), DL,
25614 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25615 MachinePointerInfo(SV));
25616 MemOps.push_back(Store);
25617
25618 // Store fp_offset
25619 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
25620 Store = DAG.getStore(
25621 Op.getOperand(0), DL,
25622 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25623 MachinePointerInfo(SV, 4));
25624 MemOps.push_back(Store);
25625
25626 // Store ptr to overflow_arg_area
25627 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25628 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25629 Store =
25630 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25631 MemOps.push_back(Store);
25632
25633 // Store ptr to reg_save_area.
25634 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25635 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25636 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25637 Store = DAG.getStore(
25638 Op.getOperand(0), DL, RSFIN, FIN,
25639 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25640 MemOps.push_back(Store);
25641 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25642}
25643
25644SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25645 assert(Subtarget.is64Bit() &&
25646 "LowerVAARG only handles 64-bit va_arg!");
25647 assert(Op.getNumOperands() == 4);
25648
25649 MachineFunction &MF = DAG.getMachineFunction();
25650 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25651 // The Win64 ABI uses char* instead of a structure.
25652 return DAG.expandVAArg(Op.getNode());
25653
25654 SDValue Chain = Op.getOperand(0);
25655 SDValue SrcPtr = Op.getOperand(1);
25656 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25657 unsigned Align = Op.getConstantOperandVal(3);
25658 SDLoc dl(Op);
25659
25660 EVT ArgVT = Op.getNode()->getValueType(0);
25661 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25662 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25663 uint8_t ArgMode;
25664
25665 // Decide which area this value should be read from.
25666 // TODO: Implement the AMD64 ABI in its entirety. This simple
25667 // selection mechanism works only for the basic types.
25668 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25669 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25670 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25671 } else {
25672 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25673 "Unhandled argument type in LowerVAARG");
25674 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25675 }
25676
25677 if (ArgMode == 2) {
25678 // Make sure using fp_offset makes sense.
25679 assert(!Subtarget.useSoftFloat() &&
25680 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25681 Subtarget.hasSSE1());
25682 }
25683
25684 // Insert VAARG node into the DAG
25685 // VAARG returns two values: Variable Argument Address, Chain
25686 SDValue InstOps[] = {Chain, SrcPtr,
25687 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25688 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25689 DAG.getTargetConstant(Align, dl, MVT::i32)};
25690 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25691 SDValue VAARG = DAG.getMemIntrinsicNode(
25692 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25693 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25694 /*Alignment=*/None,
25695 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25696 Chain = VAARG.getValue(1);
25697
25698 // Load the next argument and return it
25699 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25700}
25701
25702static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25703 SelectionDAG &DAG) {
25704 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25705 // where a va_list is still an i8*.
25706 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25707 if (Subtarget.isCallingConvWin64(
25708 DAG.getMachineFunction().getFunction().getCallingConv()))
25709 // Probably a Win64 va_copy.
25710 return DAG.expandVACopy(Op.getNode());
25711
25712 SDValue Chain = Op.getOperand(0);
25713 SDValue DstPtr = Op.getOperand(1);
25714 SDValue SrcPtr = Op.getOperand(2);
25715 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25716 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25717 SDLoc DL(Op);
25718
25719 return DAG.getMemcpy(
25720 Chain, DL, DstPtr, SrcPtr,
25721 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25722 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25723 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25724}
25725
25726// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25727static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25728 switch (Opc) {
25729 case ISD::SHL:
25730 case X86ISD::VSHL:
25731 case X86ISD::VSHLI:
25732 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25733 case ISD::SRL:
25734 case X86ISD::VSRL:
25735 case X86ISD::VSRLI:
25736 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25737 case ISD::SRA:
25738 case X86ISD::VSRA:
25739 case X86ISD::VSRAI:
25740 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25741 }
25742 llvm_unreachable("Unknown target vector shift node");
25743}
25744
25745/// Handle vector element shifts where the shift amount is a constant.
25746/// Takes immediate version of shift as input.
25747static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25748 SDValue SrcOp, uint64_t ShiftAmt,
25749 SelectionDAG &DAG) {
25750 MVT ElementType = VT.getVectorElementType();
25751
25752 // Bitcast the source vector to the output type, this is mainly necessary for
25753 // vXi8/vXi64 shifts.
25754 if (VT != SrcOp.getSimpleValueType())
25755 SrcOp = DAG.getBitcast(VT, SrcOp);
25756
25757 // Fold this packed shift into its first operand if ShiftAmt is 0.
25758 if (ShiftAmt == 0)
25759 return SrcOp;
25760
25761 // Check for ShiftAmt >= element width
25762 if (ShiftAmt >= ElementType.getSizeInBits()) {
25763 if (Opc == X86ISD::VSRAI)
25764 ShiftAmt = ElementType.getSizeInBits() - 1;
25765 else
25766 return DAG.getConstant(0, dl, VT);
25767 }
25768
25769 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25770 && "Unknown target vector shift-by-constant node");
25771
25772 // Fold this packed vector shift into a build vector if SrcOp is a
25773 // vector of Constants or UNDEFs.
25774 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25775 SmallVector<SDValue, 8> Elts;
25776 unsigned NumElts = SrcOp->getNumOperands();
25777
25778 switch (Opc) {
25779 default: llvm_unreachable("Unknown opcode!");
25780 case X86ISD::VSHLI:
25781 for (unsigned i = 0; i != NumElts; ++i) {
25782 SDValue CurrentOp = SrcOp->getOperand(i);
25783 if (CurrentOp->isUndef()) {
25784 // Must produce 0s in the correct bits.
25785 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25786 continue;
25787 }
25788 auto *ND = cast<ConstantSDNode>(CurrentOp);
25789 const APInt &C = ND->getAPIntValue();
25790 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
25791 }
25792 break;
25793 case X86ISD::VSRLI:
25794 for (unsigned i = 0; i != NumElts; ++i) {
25795 SDValue CurrentOp = SrcOp->getOperand(i);
25796 if (CurrentOp->isUndef()) {
25797 // Must produce 0s in the correct bits.
25798 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25799 continue;
25800 }
25801 auto *ND = cast<ConstantSDNode>(CurrentOp);
25802 const APInt &C = ND->getAPIntValue();
25803 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
25804 }
25805 break;
25806 case X86ISD::VSRAI:
25807 for (unsigned i = 0; i != NumElts; ++i) {
25808 SDValue CurrentOp = SrcOp->getOperand(i);
25809 if (CurrentOp->isUndef()) {
25810 // All shifted in bits must be the same so use 0.
25811 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25812 continue;
25813 }
25814 auto *ND = cast<ConstantSDNode>(CurrentOp);
25815 const APInt &C = ND->getAPIntValue();
25816 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
25817 }
25818 break;
25819 }
25820
25821 return DAG.getBuildVector(VT, dl, Elts);
25822 }
25823
25824 return DAG.getNode(Opc, dl, VT, SrcOp,
25825 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25826}
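// [Editor's note: illustrative example, not part of X86ISelLowering.cpp.]
// The constant-folding path above rewrites a shift of a constant build vector
// as a new build vector; e.g. a hypothetical v4i32 VSHLI by 3 folds
//   <i32 1, i32 2, undef, i32 4>  -->  <i32 8, i32 16, i32 0, i32 32>
// with undef lanes forced to 0 so the shifted-in bits are well defined.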
25827
25828/// Handle vector element shifts by a splat shift amount
25829static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25830 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25831 const X86Subtarget &Subtarget,
25832 SelectionDAG &DAG) {
25833 MVT AmtVT = ShAmt.getSimpleValueType();
25834 assert(AmtVT.isVector() && "Vector shift type mismatch");
25835 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25836 "Illegal vector splat index");
25837
25838 // Move the splat element to the bottom element.
25839 if (ShAmtIdx != 0) {
25840 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25841 Mask[0] = ShAmtIdx;
25842 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25843 }
25844
25845 // Peek through any zext node if we can get back to a 128-bit source.
25846 if (AmtVT.getScalarSizeInBits() == 64 &&
25847 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25848 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25849 ShAmt.getOperand(0).getValueType().isSimple() &&
25850 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25851 ShAmt = ShAmt.getOperand(0);
25852 AmtVT = ShAmt.getSimpleValueType();
25853 }
25854
25855 // See if we can mask off the upper elements using the existing source node.
25856 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25857 // do this for vXi64 types.
25858 bool IsMasked = false;
25859 if (AmtVT.getScalarSizeInBits() < 64) {
25860 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25861 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25862 // If the shift amount has come from a scalar, then zero-extend the scalar
25863 // before moving to the vector.
25864 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25865 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25866 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25867 AmtVT = MVT::v4i32;
25868 IsMasked = true;
25869 } else if (ShAmt.getOpcode() == ISD::AND) {
25870 // See if the shift amount is already masked (e.g. for rotation modulo),
25871 // then we can zero-extend it by setting all the other mask elements to
25872 // zero.
25873 SmallVector<SDValue> MaskElts(
25874 AmtVT.getVectorNumElements(),
25875 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25876 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25877 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25878 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25879 {ShAmt.getOperand(1), Mask}))) {
25880 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25881 IsMasked = true;
25882 }
25883 }
25884 }
25885
25886 // Extract if the shift amount vector is larger than 128-bits.
25887 if (AmtVT.getSizeInBits() > 128) {
25888 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25889 AmtVT = ShAmt.getSimpleValueType();
25890 }
25891
25892 // Zero-extend bottom element to v2i64 vector type, either by extension or
25893 // shuffle masking.
25894 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25895 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25896 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25897 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25898 } else if (Subtarget.hasSSE41()) {
25899 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25900 MVT::v2i64, ShAmt);
25901 } else {
25902 SDValue ByteShift = DAG.getTargetConstant(
25903 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25904 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25905 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25906 ByteShift);
25907 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25908 ByteShift);
25909 }
25910 }
25911
25912 // Change opcode to non-immediate version.
25913 Opc = getTargetVShiftUniformOpcode(Opc, true);
25914
25915 // The return type has to be a 128-bit type with the same element
25916 // type as the input type.
25917 MVT EltVT = VT.getVectorElementType();
25918 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25919
25920 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25921 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25922}
25923
25924/// Return Mask with the necessary casting or extending
25925/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25926static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25927 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25928 const SDLoc &dl) {
25929
25930 if (isAllOnesConstant(Mask))
25931 return DAG.getConstant(1, dl, MaskVT);
25932 if (X86::isZeroNode(Mask))
25933 return DAG.getConstant(0, dl, MaskVT);
25934
25935 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25936
25937 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25938 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25939 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25940 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25941 SDValue Lo, Hi;
25942 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25943 DAG.getConstant(0, dl, MVT::i32));
25944 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25945 DAG.getConstant(1, dl, MVT::i32));
25946
25947 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25948 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25949
25950 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25951 } else {
25952 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25953 Mask.getSimpleValueType().getSizeInBits());
25954 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
25955 // are extracted by EXTRACT_SUBVECTOR.
25956 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25957 DAG.getBitcast(BitcastVT, Mask),
25958 DAG.getIntPtrConstant(0, dl));
25959 }
25960}
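// [Editor's note: illustrative example, not part of X86ISelLowering.cpp.]
// e.g. a v64i1 mask on a 32-bit AVX512BW target: the i64 mask value cannot be
// bitcast directly, so it is split into its low and high i32 halves with
// EXTRACT_ELEMENT, each half is bitcast to v32i1, and the halves are
// concatenated back into the requested v64i1.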
25961
25962/// Return (and \p Op, \p Mask) for compare instructions or
25963/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25964/// necessary casting or extending for \p Mask when lowering masking intrinsics
25965static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25966 SDValue PreservedSrc,
25967 const X86Subtarget &Subtarget,
25968 SelectionDAG &DAG) {
25969 MVT VT = Op.getSimpleValueType();
25970 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25971 unsigned OpcodeSelect = ISD::VSELECT;
25972 SDLoc dl(Op);
25973
25974 if (isAllOnesConstant(Mask))
25975 return Op;
25976
25977 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25978
25979 if (PreservedSrc.isUndef())
25980 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25981 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25982}
25983
25984/// Creates an SDNode for a predicated scalar operation.
25985/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25986/// The mask is coming as MVT::i8 and it should be transformed
25987/// to MVT::v1i1 while lowering masking intrinsics.
25988/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25989/// "X86select" instead of "vselect". We just can't create the "vselect" node
25990/// for a scalar instruction.
25991static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25992 SDValue PreservedSrc,
25993 const X86Subtarget &Subtarget,
25994 SelectionDAG &DAG) {
25995
25996 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25997 if (MaskConst->getZExtValue() & 0x1)
25998 return Op;
25999
26000 MVT VT = Op.getSimpleValueType();
26001 SDLoc dl(Op);
26002
26003 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26004 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26005 DAG.getBitcast(MVT::v8i1, Mask),
26006 DAG.getIntPtrConstant(0, dl));
26007 if (Op.getOpcode() == X86ISD::FSETCCM ||
26008 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26009 Op.getOpcode() == X86ISD::VFPCLASSS)
26010 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26011
26012 if (PreservedSrc.isUndef())
26013 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26014 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26015}
26016
26017static int getSEHRegistrationNodeSize(const Function *Fn) {
26018 if (!Fn->hasPersonalityFn())
26019 report_fatal_error(
26020 "querying registration node size for function without personality");
26021 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26022 // WinEHStatePass for the full struct definition.
26023 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26024 case EHPersonality::MSVC_X86SEH: return 24;
26025 case EHPersonality::MSVC_CXX: return 16;
26026 default: break;
26027 }
26028 report_fatal_error(
26029 "can only recover FP for 32-bit MSVC EH personality functions");
26030}
26031
26032/// When the MSVC runtime transfers control to us, either to an outlined
26033/// function or when returning to a parent frame after catching an exception, we
26034/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26035/// Here's the math:
26036/// RegNodeBase = EntryEBP - RegNodeSize
26037/// ParentFP = RegNodeBase - ParentFrameOffset
26038/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26039/// subtracting the offset (negative on x86) takes us back to the parent FP.
26040static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26041 SDValue EntryEBP) {
26042 MachineFunction &MF = DAG.getMachineFunction();
26043 SDLoc dl;
26044
26045 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26046 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26047
26048 // It's possible that the parent function no longer has a personality function
26049 // if the exceptional code was optimized away, in which case we just return
26050 // the incoming EBP.
26051 if (!Fn->hasPersonalityFn())
26052 return EntryEBP;
26053
26054 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26055 // registration, or the .set_setframe offset.
26056 MCSymbol *OffsetSym =
26057 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
26058 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26059 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26060 SDValue ParentFrameOffset =
26061 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26062
26063 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26064 // prologue to RBP in the parent function.
26065 const X86Subtarget &Subtarget =
26066 static_cast<const X86Subtarget &>(DAG.getSubtarget());
26067 if (Subtarget.is64Bit())
26068 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26069
26070 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26071 // RegNodeBase = EntryEBP - RegNodeSize
26072 // ParentFP = RegNodeBase - ParentFrameOffset
26073 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26074 DAG.getConstant(RegNodeSize, dl, PtrVT));
26075 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26076}
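// [Editor's note: worked example with made-up numbers, not part of
// X86ISelLowering.cpp.] For the 32-bit MSVC C++ EH personality RegNodeSize is
// 16, so with EntryEBP = 0x0019FF40 and ParentFrameOffset = -0x20:
//   RegNodeBase = 0x0019FF40 - 16    = 0x0019FF30
//   ParentFP    = 0x0019FF30 - (-32) = 0x0019FF50
// i.e. in this example the recovered parent FP lies above the incoming EBP.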
26077
26078SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26079 SelectionDAG &DAG) const {
26080 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26081 auto isRoundModeCurDirection = [](SDValue Rnd) {
26082 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26083 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26084
26085 return false;
26086 };
26087 auto isRoundModeSAE = [](SDValue Rnd) {
26088 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26089 unsigned RC = C->getZExtValue();
26090 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26091 // Clear the NO_EXC bit and check remaining bits.
26092 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26093 // As a convenience we allow no other bits or explicitly
26094 // current direction.
26095 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26096 }
26097 }
26098
26099 return false;
26100 };
26101 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26102 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26103 RC = C->getZExtValue();
26104 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26105 // Clear the NO_EXC bit and check remaining bits.
26106 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26107 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26108 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26109 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26110 RC == X86::STATIC_ROUNDING::TO_ZERO;
26111 }
26112 }
26113
26114 return false;
26115 };
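// [Editor's note: illustrative examples, not part of X86ISelLowering.cpp;
// they assume the usual X86::STATIC_ROUNDING encoding (TO_NEAREST_INT=0,
// TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8).]
//   Rnd = 4     -> isRoundModeCurDirection: use the current MXCSR mode
//   Rnd = 8 | 1 -> isRoundModeSAEToX, RC=1: SAE plus round toward -inf
//   Rnd = 8 | 4 -> isRoundModeSAE: SAE while keeping the current direction
//   Rnd = 1     -> none match, so the lowerings below bail out with SDValue()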
26116
26117 SDLoc dl(Op);
26118 unsigned IntNo = Op.getConstantOperandVal(0);
26119 MVT VT = Op.getSimpleValueType();
26120 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26121
26122 // Propagate flags from original node to transformed node(s).
26123 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26124
26125 if (IntrData) {
26126 switch(IntrData->Type) {
26127 case INTR_TYPE_1OP: {
26128 // We specify 2 possible opcodes for intrinsics with rounding modes.
26129 // First, we check if the intrinsic may have non-default rounding mode,
26130 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26131 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26132 if (IntrWithRoundingModeOpcode != 0) {
26133 SDValue Rnd = Op.getOperand(2);
26134 unsigned RC = 0;
26135 if (isRoundModeSAEToX(Rnd, RC))
26136 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26137 Op.getOperand(1),
26138 DAG.getTargetConstant(RC, dl, MVT::i32));
26139 if (!isRoundModeCurDirection(Rnd))
26140 return SDValue();
26141 }
26142 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26143 Op.getOperand(1));
26144 }
26145 case INTR_TYPE_1OP_SAE: {
26146 SDValue Sae = Op.getOperand(2);
26147
26148 unsigned Opc;
26149 if (isRoundModeCurDirection(Sae))
26150 Opc = IntrData->Opc0;
26151 else if (isRoundModeSAE(Sae))
26152 Opc = IntrData->Opc1;
26153 else
26154 return SDValue();
26155
26156 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26157 }
26158 case INTR_TYPE_2OP: {
26159 SDValue Src2 = Op.getOperand(2);
26160
26161 // We specify 2 possible opcodes for intrinsics with rounding modes.
26162 // First, we check if the intrinsic may have non-default rounding mode,
26163 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26164 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26165 if (IntrWithRoundingModeOpcode != 0) {
26166 SDValue Rnd = Op.getOperand(3);
26167 unsigned RC = 0;
26168 if (isRoundModeSAEToX(Rnd, RC))
26169 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26170 Op.getOperand(1), Src2,
26171 DAG.getTargetConstant(RC, dl, MVT::i32));
26172 if (!isRoundModeCurDirection(Rnd))
26173 return SDValue();
26174 }
26175
26176 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26177 Op.getOperand(1), Src2);
26178 }
26179 case INTR_TYPE_2OP_SAE: {
26180 SDValue Sae = Op.getOperand(3);
26181
26182 unsigned Opc;
26183 if (isRoundModeCurDirection(Sae))
26184 Opc = IntrData->Opc0;
26185 else if (isRoundModeSAE(Sae))
26186 Opc = IntrData->Opc1;
26187 else
26188 return SDValue();
26189
26190 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26191 Op.getOperand(2));
26192 }
26193 case INTR_TYPE_3OP:
26194 case INTR_TYPE_3OP_IMM8: {
26195 SDValue Src1 = Op.getOperand(1);
26196 SDValue Src2 = Op.getOperand(2);
26197 SDValue Src3 = Op.getOperand(3);
26198
26199 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26200 Src3.getValueType() != MVT::i8) {
26201 Src3 = DAG.getTargetConstant(
26202 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
26203 }
26204
26205 // We specify 2 possible opcodes for intrinsics with rounding modes.
26206 // First, we check if the intrinsic may have non-default rounding mode,
26207 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26208 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26209 if (IntrWithRoundingModeOpcode != 0) {
26210 SDValue Rnd = Op.getOperand(4);
26211 unsigned RC = 0;
26212 if (isRoundModeSAEToX(Rnd, RC))
26213 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26214 Src1, Src2, Src3,
26215 DAG.getTargetConstant(RC, dl, MVT::i32));
26216 if (!isRoundModeCurDirection(Rnd))
26217 return SDValue();
26218 }
26219
26220 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26221 {Src1, Src2, Src3});
26222 }
26223 case INTR_TYPE_4OP_IMM8: {
26224 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26225 SDValue Src4 = Op.getOperand(4);
26226 if (Src4.getValueType() != MVT::i8) {
26227 Src4 = DAG.getTargetConstant(
26228 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
26229 }
26230
26231 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26232 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26233 Src4);
26234 }
26235 case INTR_TYPE_1OP_MASK: {
26236 SDValue Src = Op.getOperand(1);
26237 SDValue PassThru = Op.getOperand(2);
26238 SDValue Mask = Op.getOperand(3);
26239 // We add rounding mode to the Node when
26240 // - RC Opcode is specified and
26241 // - RC is not "current direction".
26242 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26243 if (IntrWithRoundingModeOpcode != 0) {
26244 SDValue Rnd = Op.getOperand(4);
26245 unsigned RC = 0;
26246 if (isRoundModeSAEToX(Rnd, RC))
26247 return getVectorMaskingNode(
26248 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26249 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26250 Mask, PassThru, Subtarget, DAG);
26251 if (!isRoundModeCurDirection(Rnd))
26252 return SDValue();
26253 }
26254 return getVectorMaskingNode(
26255 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26256 Subtarget, DAG);
26257 }
26258 case INTR_TYPE_1OP_MASK_SAE: {
26259 SDValue Src = Op.getOperand(1);
26260 SDValue PassThru = Op.getOperand(2);
26261 SDValue Mask = Op.getOperand(3);
26262 SDValue Rnd = Op.getOperand(4);
26263
26264 unsigned Opc;
26265 if (isRoundModeCurDirection(Rnd))
26266 Opc = IntrData->Opc0;
26267 else if (isRoundModeSAE(Rnd))
26268 Opc = IntrData->Opc1;
26269 else
26270 return SDValue();
26271
26272 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26273 Subtarget, DAG);
26274 }
26275 case INTR_TYPE_SCALAR_MASK: {
26276 SDValue Src1 = Op.getOperand(1);
26277 SDValue Src2 = Op.getOperand(2);
26278 SDValue passThru = Op.getOperand(3);
26279 SDValue Mask = Op.getOperand(4);
26280 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26281 // There are 2 kinds of intrinsics in this group:
26282 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26283 // (2) With rounding mode and sae - 7 operands.
26284 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26285 if (Op.getNumOperands() == (5U + HasRounding)) {
26286 if (HasRounding) {
26287 SDValue Rnd = Op.getOperand(5);
26288 unsigned RC = 0;
26289 if (isRoundModeSAEToX(Rnd, RC))
26290 return getScalarMaskingNode(
26291 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26292 DAG.getTargetConstant(RC, dl, MVT::i32)),
26293 Mask, passThru, Subtarget, DAG);
26294 if (!isRoundModeCurDirection(Rnd))
26295 return SDValue();
26296 }
26297 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26298 Src2),
26299 Mask, passThru, Subtarget, DAG);
26300 }
26301
26302 assert(Op.getNumOperands() == (6U + HasRounding) &&
26303 "Unexpected intrinsic form");
26304 SDValue RoundingMode = Op.getOperand(5);
26305 unsigned Opc = IntrData->Opc0;
26306 if (HasRounding) {
26307 SDValue Sae = Op.getOperand(6);
26308 if (isRoundModeSAE(Sae))
26309 Opc = IntrWithRoundingModeOpcode;
26310 else if (!isRoundModeCurDirection(Sae))
26311 return SDValue();
26312 }
26313 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26314 Src2, RoundingMode),
26315 Mask, passThru, Subtarget, DAG);
26316 }
26317 case INTR_TYPE_SCALAR_MASK_RND: {
26318 SDValue Src1 = Op.getOperand(1);
26319 SDValue Src2 = Op.getOperand(2);
26320 SDValue passThru = Op.getOperand(3);
26321 SDValue Mask = Op.getOperand(4);
26322 SDValue Rnd = Op.getOperand(5);
26323
26324 SDValue NewOp;
26325 unsigned RC = 0;
26326 if (isRoundModeCurDirection(Rnd))
26327 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26328 else if (isRoundModeSAEToX(Rnd, RC))
26329 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26330 DAG.getTargetConstant(RC, dl, MVT::i32));
26331 else
26332 return SDValue();
26333
26334 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26335 }
26336 case INTR_TYPE_SCALAR_MASK_SAE: {
26337 SDValue Src1 = Op.getOperand(1);
26338 SDValue Src2 = Op.getOperand(2);
26339 SDValue passThru = Op.getOperand(3);
26340 SDValue Mask = Op.getOperand(4);
26341 SDValue Sae = Op.getOperand(5);
26342 unsigned Opc;
26343 if (isRoundModeCurDirection(Sae))
26344 Opc = IntrData->Opc0;
26345 else if (isRoundModeSAE(Sae))
26346 Opc = IntrData->Opc1;
26347 else
26348 return SDValue();
26349
26350 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26351 Mask, passThru, Subtarget, DAG);
26352 }
26353 case INTR_TYPE_2OP_MASK: {
26354 SDValue Src1 = Op.getOperand(1);
26355 SDValue Src2 = Op.getOperand(2);
26356 SDValue PassThru = Op.getOperand(3);
26357 SDValue Mask = Op.getOperand(4);
26358 SDValue NewOp;
26359 if (IntrData->Opc1 != 0) {
26360 SDValue Rnd = Op.getOperand(5);
26361 unsigned RC = 0;
26362 if (isRoundModeSAEToX(Rnd, RC))
26363 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26364 DAG.getTargetConstant(RC, dl, MVT::i32));
26365 else if (!isRoundModeCurDirection(Rnd))
26366 return SDValue();
26367 }
26368 if (!NewOp)
26369 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26370 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26371 }
26372 case INTR_TYPE_2OP_MASK_SAE: {
26373 SDValue Src1 = Op.getOperand(1);
26374 SDValue Src2 = Op.getOperand(2);
26375 SDValue PassThru = Op.getOperand(3);
26376 SDValue Mask = Op.getOperand(4);
26377
26378 unsigned Opc = IntrData->Opc0;
26379 if (IntrData->Opc1 != 0) {
26380 SDValue Sae = Op.getOperand(5);
26381 if (isRoundModeSAE(Sae))
26382 Opc = IntrData->Opc1;
26383 else if (!isRoundModeCurDirection(Sae))
26384 return SDValue();
26385 }
26386
26387 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26388 Mask, PassThru, Subtarget, DAG);
26389 }
26390 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26391 SDValue Src1 = Op.getOperand(1);
26392 SDValue Src2 = Op.getOperand(2);
26393 SDValue Src3 = Op.getOperand(3);
26394 SDValue PassThru = Op.getOperand(4);
26395 SDValue Mask = Op.getOperand(5);
26396 SDValue Sae = Op.getOperand(6);
26397 unsigned Opc;
26398 if (isRoundModeCurDirection(Sae))
26399 Opc = IntrData->Opc0;
26400 else if (isRoundModeSAE(Sae))
26401 Opc = IntrData->Opc1;
26402 else
26403 return SDValue();
26404
26405 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26406 Mask, PassThru, Subtarget, DAG);
26407 }
26408 case INTR_TYPE_3OP_MASK_SAE: {
26409 SDValue Src1 = Op.getOperand(1);
26410 SDValue Src2 = Op.getOperand(2);
26411 SDValue Src3 = Op.getOperand(3);
26412 SDValue PassThru = Op.getOperand(4);
26413 SDValue Mask = Op.getOperand(5);
26414
26415 unsigned Opc = IntrData->Opc0;
26416 if (IntrData->Opc1 != 0) {
26417 SDValue Sae = Op.getOperand(6);
26418 if (isRoundModeSAE(Sae))
26419 Opc = IntrData->Opc1;
26420 else if (!isRoundModeCurDirection(Sae))
26421 return SDValue();
26422 }
26423 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26424 Mask, PassThru, Subtarget, DAG);
26425 }
26426 case BLENDV: {
26427 SDValue Src1 = Op.getOperand(1);
26428 SDValue Src2 = Op.getOperand(2);
26429 SDValue Src3 = Op.getOperand(3);
26430
26431 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26432 Src3 = DAG.getBitcast(MaskVT, Src3);
26433
26434 // Reverse the operands to match VSELECT order.
26435 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26436 }
26437 case VPERM_2OP : {
26438 SDValue Src1 = Op.getOperand(1);
26439 SDValue Src2 = Op.getOperand(2);
26440
26441 // Swap Src1 and Src2 in the node creation
26442 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26443 }
26444 case CFMA_OP_MASKZ:
26445 case CFMA_OP_MASK: {
26446 SDValue Src1 = Op.getOperand(1);
26447 SDValue Src2 = Op.getOperand(2);
26448 SDValue Src3 = Op.getOperand(3);
26449 SDValue Mask = Op.getOperand(4);
26450 MVT VT = Op.getSimpleValueType();
26451
26452 SDValue PassThru = Src3;
26453 if (IntrData->Type == CFMA_OP_MASKZ)
26454 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26455
26456 // We add rounding mode to the Node when
26457 // - RC Opcode is specified and
26458 // - RC is not "current direction".
26459 SDValue NewOp;
26460 if (IntrData->Opc1 != 0) {
26461 SDValue Rnd = Op.getOperand(5);
26462 unsigned RC = 0;
26463 if (isRoundModeSAEToX(Rnd, RC))
26464 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26465 DAG.getTargetConstant(RC, dl, MVT::i32));
26466 else if (!isRoundModeCurDirection(Rnd))
26467 return SDValue();
26468 }
26469 if (!NewOp)
26470 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26471 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26472 }
26473 case IFMA_OP:
26474 // NOTE: We need to swizzle the operands to pass the multiply operands
26475 // first.
26476 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26477 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26478 case FPCLASSS: {
26479 SDValue Src1 = Op.getOperand(1);
26480 SDValue Imm = Op.getOperand(2);
26481 SDValue Mask = Op.getOperand(3);
26482 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26483 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26484 Subtarget, DAG);
26485 // Need to fill with zeros to ensure the bitcast will produce zeroes
26486 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26487 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26488 DAG.getConstant(0, dl, MVT::v8i1),
26489 FPclassMask, DAG.getIntPtrConstant(0, dl));
26490 return DAG.getBitcast(MVT::i8, Ins);
26491 }
26492
26493 case CMP_MASK_CC: {
26494 MVT MaskVT = Op.getSimpleValueType();
26495 SDValue CC = Op.getOperand(3);
26496 SDValue Mask = Op.getOperand(4);
26497 // We specify 2 possible opcodes for intrinsics with rounding modes.
26498 // First, we check if the intrinsic may have non-default rounding mode,
26499 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26500 if (IntrData->Opc1 != 0) {
26501 SDValue Sae = Op.getOperand(5);
26502 if (isRoundModeSAE(Sae))
26503 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26504 Op.getOperand(2), CC, Mask, Sae);
26505 if (!isRoundModeCurDirection(Sae))
26506 return SDValue();
26507 }
26508 //default rounding mode
26509 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26510 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26511 }
26512 case CMP_MASK_SCALAR_CC: {
26513 SDValue Src1 = Op.getOperand(1);
26514 SDValue Src2 = Op.getOperand(2);
26515 SDValue CC = Op.getOperand(3);
26516 SDValue Mask = Op.getOperand(4);
26517
26518 SDValue Cmp;
26519 if (IntrData->Opc1 != 0) {
26520 SDValue Sae = Op.getOperand(5);
26521 if (isRoundModeSAE(Sae))
26522 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26523 else if (!isRoundModeCurDirection(Sae))
26524 return SDValue();
26525 }
26526 //default rounding mode
26527 if (!Cmp.getNode())
26528 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26529
26530 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26531 Subtarget, DAG);
26532 // Need to fill with zeros to ensure the bitcast will produce zeroes
26533 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26534 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26535 DAG.getConstant(0, dl, MVT::v8i1),
26536 CmpMask, DAG.getIntPtrConstant(0, dl));
26537 return DAG.getBitcast(MVT::i8, Ins);
26538 }
26539 case COMI: { // Comparison intrinsics
26540 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26541 SDValue LHS = Op.getOperand(1);
26542 SDValue RHS = Op.getOperand(2);
26543 // Some conditions require the operands to be swapped.
26544 if (CC == ISD::SETLT || CC == ISD::SETLE)
26545 std::swap(LHS, RHS);
26546
26547 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
26548 SDValue SetCC;
26549 switch (CC) {
26550 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
26551 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26552 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26553 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26554 break;
26555 }
26556 case ISD::SETNE: { // (ZF = 1 or PF = 1)
26557 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26558 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26559 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26560 break;
26561 }
26562 case ISD::SETGT: // (CF = 0 and ZF = 0)
26563 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26564 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26565 break;
26566 }
26567 case ISD::SETGE: // CF = 0
26568 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26569 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26570 break;
26571 default:
26572 llvm_unreachable("Unexpected illegal condition!");
26573 }
26574 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26575 }
26576 case COMI_RM: { // Comparison intrinsics with Sae
26577 SDValue LHS = Op.getOperand(1);
26578 SDValue RHS = Op.getOperand(2);
26579 unsigned CondVal = Op.getConstantOperandVal(3);
26580 SDValue Sae = Op.getOperand(4);
26581
26582 SDValue FCmp;
26583 if (isRoundModeCurDirection(Sae))
26584 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26585 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26586 else if (isRoundModeSAE(Sae))
26587 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26588 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26589 else
26590 return SDValue();
26591 // Need to fill with zeros to ensure the bitcast will produce zeroes
26592 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26593 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26594 DAG.getConstant(0, dl, MVT::v16i1),
26595 FCmp, DAG.getIntPtrConstant(0, dl));
26596 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26597 DAG.getBitcast(MVT::i16, Ins));
26598 }
26599 case VSHIFT: {
26600 SDValue SrcOp = Op.getOperand(1);
26601 SDValue ShAmt = Op.getOperand(2);
26602 assert(ShAmt.getValueType() == MVT::i32 &&
26603 "Unexpected VSHIFT amount type");
26604
26605 // Catch shift-by-constant.
26606 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26607 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26608 Op.getSimpleValueType(), SrcOp,
26609 CShAmt->getZExtValue(), DAG);
26610
26611 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26612 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26613 SrcOp, ShAmt, 0, Subtarget, DAG);
26614 }
26615 case COMPRESS_EXPAND_IN_REG: {
26616 SDValue Mask = Op.getOperand(3);
26617 SDValue DataToCompress = Op.getOperand(1);
26618 SDValue PassThru = Op.getOperand(2);
26619 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26620 return Op.getOperand(1);
26621
26622 // Avoid false dependency.
26623 if (PassThru.isUndef())
26624 PassThru = DAG.getConstant(0, dl, VT);
26625
26626 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26627 Mask);
26628 }
26629 case FIXUPIMM:
26630 case FIXUPIMM_MASKZ: {
26631 SDValue Src1 = Op.getOperand(1);
26632 SDValue Src2 = Op.getOperand(2);
26633 SDValue Src3 = Op.getOperand(3);
26634 SDValue Imm = Op.getOperand(4);
26635 SDValue Mask = Op.getOperand(5);
26636 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26637 ? Src1
26638 : getZeroVector(VT, Subtarget, DAG, dl);
26639
26640 unsigned Opc = IntrData->Opc0;
26641 if (IntrData->Opc1 != 0) {
26642 SDValue Sae = Op.getOperand(6);
26643 if (isRoundModeSAE(Sae))
26644 Opc = IntrData->Opc1;
26645 else if (!isRoundModeCurDirection(Sae))
26646 return SDValue();
26647 }
26648
26649 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26650
26651 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26652 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26653
26654 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26655 }
26656 case ROUNDP: {
26657 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26658 // Clear the upper bits of the rounding immediate so that the legacy
26659 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26660 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
26661 SDValue RoundingMode =
26662 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26663 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26664 Op.getOperand(1), RoundingMode);
26665 }
26666 case ROUNDS: {
26667 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26668 // Clear the upper bits of the rounding immediate so that the legacy
26669 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26670 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
26671 SDValue RoundingMode =
26672 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26673 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26674 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26675 }
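   // BEXTRI takes its control word as an immediate; only the low 16 bits are
   // meaningful (the start bit index and the field length each occupy one
   // byte), so the constant operand is masked with 0xffff before building the
   // target node.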
26676 case BEXTRI: {
26677     assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26678
26679 uint64_t Imm = Op.getConstantOperandVal(2);
26680 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26681 Op.getValueType());
26682 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26683 Op.getOperand(1), Control);
26684 }
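   // For the ADX case below, the incoming carry is a plain integer operand, so
   // it has to be turned back into EFLAGS.CF before the ADC/SBB node can use
   // it: adding -1 (0xFF) to the 8-bit carry value produces a carry-out exactly
   // when that value is nonzero. A carry-in known to be zero skips this and
   // uses a plain ADD/SUB instead. The final result pairs the CF flag
   // (materialized with SETB) with the arithmetic value.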
26685 // ADC/ADCX/SBB
26686 case ADX: {
26687 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26688 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26689
26690 SDValue Res;
26691 // If the carry in is zero, then we should just use ADD/SUB instead of
26692 // ADC/SBB.
26693 if (isNullConstant(Op.getOperand(1))) {
26694 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26695 Op.getOperand(3));
26696 } else {
26697 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26698 DAG.getConstant(-1, dl, MVT::i8));
26699 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26700 Op.getOperand(3), GenCF.getValue(1));
26701 }
26702 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26703 SDValue Results[] = { SetCC, Res };
26704 return DAG.getMergeValues(Results, dl);
26705 }
26706 case CVTPD2PS_MASK:
26707 case CVTPD2DQ_MASK:
26708 case CVTQQ2PS_MASK:
26709 case TRUNCATE_TO_REG: {
26710 SDValue Src = Op.getOperand(1);
26711 SDValue PassThru = Op.getOperand(2);
26712 SDValue Mask = Op.getOperand(3);
26713
26714 if (isAllOnesConstant(Mask))
26715 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26716
26717 MVT SrcVT = Src.getSimpleValueType();
26718 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26719 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26720 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26721 {Src, PassThru, Mask});
26722 }
26723 case CVTPS2PH_MASK: {
26724 SDValue Src = Op.getOperand(1);
26725 SDValue Rnd = Op.getOperand(2);
26726 SDValue PassThru = Op.getOperand(3);
26727 SDValue Mask = Op.getOperand(4);
26728
26729 if (isAllOnesConstant(Mask))
26730 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
26731
26732 MVT SrcVT = Src.getSimpleValueType();
26733 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26734 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26735 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
26736 PassThru, Mask);
26737
26738 }
26739 case CVTNEPS2BF16_MASK: {
26740 SDValue Src = Op.getOperand(1);
26741 SDValue PassThru = Op.getOperand(2);
26742 SDValue Mask = Op.getOperand(3);
26743
26744 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26745 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26746
26747 // Break false dependency.
26748 if (PassThru.isUndef())
26749 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26750
26751 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26752 Mask);
26753 }
26754 default:
26755 break;
26756 }
26757 }
26758
26759 switch (IntNo) {
26760 default: return SDValue(); // Don't custom lower most intrinsics.
26761
26762  // ptest and testp intrinsics. The intrinsics these come from are designed to
26763  // return an integer value, not just an instruction, so lower it to the ptest
26764  // or testp pattern and a setcc for the result.
26765 case Intrinsic::x86_avx512_ktestc_b:
26766 case Intrinsic::x86_avx512_ktestc_w:
26767 case Intrinsic::x86_avx512_ktestc_d:
26768 case Intrinsic::x86_avx512_ktestc_q:
26769 case Intrinsic::x86_avx512_ktestz_b:
26770 case Intrinsic::x86_avx512_ktestz_w:
26771 case Intrinsic::x86_avx512_ktestz_d:
26772 case Intrinsic::x86_avx512_ktestz_q:
26773 case Intrinsic::x86_sse41_ptestz:
26774 case Intrinsic::x86_sse41_ptestc:
26775 case Intrinsic::x86_sse41_ptestnzc:
26776 case Intrinsic::x86_avx_ptestz_256:
26777 case Intrinsic::x86_avx_ptestc_256:
26778 case Intrinsic::x86_avx_ptestnzc_256:
26779 case Intrinsic::x86_avx_vtestz_ps:
26780 case Intrinsic::x86_avx_vtestc_ps:
26781 case Intrinsic::x86_avx_vtestnzc_ps:
26782 case Intrinsic::x86_avx_vtestz_pd:
26783 case Intrinsic::x86_avx_vtestc_pd:
26784 case Intrinsic::x86_avx_vtestnzc_pd:
26785 case Intrinsic::x86_avx_vtestz_ps_256:
26786 case Intrinsic::x86_avx_vtestc_ps_256:
26787 case Intrinsic::x86_avx_vtestnzc_ps_256:
26788 case Intrinsic::x86_avx_vtestz_pd_256:
26789 case Intrinsic::x86_avx_vtestc_pd_256:
26790 case Intrinsic::x86_avx_vtestnzc_pd_256: {
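     // All of these test intrinsics reduce to one flag-producing node plus a
     // SETCC: KTEST for the AVX-512 mask-register variants, TESTP for the AVX
     // floating-point vtest* variants, and PTEST for the rest. The *z forms
     // read ZF (COND_E), the *c forms read CF (COND_B), and the *nzc forms
     // require both flags clear (COND_A); the selected flag is zero-extended
     // to the i32 result.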
26791 unsigned TestOpc = X86ISD::PTEST;
26792 X86::CondCode X86CC;
26793 switch (IntNo) {
26794     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26795 case Intrinsic::x86_avx512_ktestc_b:
26796 case Intrinsic::x86_avx512_ktestc_w:
26797 case Intrinsic::x86_avx512_ktestc_d:
26798 case Intrinsic::x86_avx512_ktestc_q:
26799 // CF = 1
26800 TestOpc = X86ISD::KTEST;
26801 X86CC = X86::COND_B;
26802 break;
26803 case Intrinsic::x86_avx512_ktestz_b:
26804 case Intrinsic::x86_avx512_ktestz_w:
26805 case Intrinsic::x86_avx512_ktestz_d:
26806 case Intrinsic::x86_avx512_ktestz_q:
26807 TestOpc = X86ISD::KTEST;
26808 X86CC = X86::COND_E;
26809 break;
26810 case Intrinsic::x86_avx_vtestz_ps:
26811 case Intrinsic::x86_avx_vtestz_pd:
26812 case Intrinsic::x86_avx_vtestz_ps_256:
26813 case Intrinsic::x86_avx_vtestz_pd_256:
26814 TestOpc = X86ISD::TESTP;
26815       LLVM_FALLTHROUGH;
26816 case Intrinsic::x86_sse41_ptestz:
26817 case Intrinsic::x86_avx_ptestz_256:
26818 // ZF = 1
26819 X86CC = X86::COND_E;
26820 break;
26821 case Intrinsic::x86_avx_vtestc_ps:
26822 case Intrinsic::x86_avx_vtestc_pd:
26823 case Intrinsic::x86_avx_vtestc_ps_256:
26824 case Intrinsic::x86_avx_vtestc_pd_256:
26825 TestOpc = X86ISD::TESTP;
26826       LLVM_FALLTHROUGH;
26827 case Intrinsic::x86_sse41_ptestc:
26828 case Intrinsic::x86_avx_ptestc_256:
26829 // CF = 1
26830 X86CC = X86::COND_B;
26831 break;
26832 case Intrinsic::x86_avx_vtestnzc_ps:
26833 case Intrinsic::x86_avx_vtestnzc_pd:
26834 case Intrinsic::x86_avx_vtestnzc_ps_256:
26835 case Intrinsic::x86_avx_vtestnzc_pd_256:
26836 TestOpc = X86ISD::TESTP;
26837       LLVM_FALLTHROUGH;
26838 case Intrinsic::x86_sse41_ptestnzc:
26839 case Intrinsic::x86_avx_ptestnzc_256:
26840 // ZF and CF = 0
26841 X86CC = X86::COND_A;
26842 break;
26843 }
26844
26845 SDValue LHS = Op.getOperand(1);
26846 SDValue RHS = Op.getOperand(2);
26847 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26848 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26849 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26850 }
26851
26852 case Intrinsic::x86_sse42_pcmpistria128:
26853 case Intrinsic::x86_sse42_pcmpestria128:
26854 case Intrinsic::x86_sse42_pcmpistric128:
26855 case Intrinsic::x86_sse42_pcmpestric128:
26856 case Intrinsic::x86_sse42_pcmpistrio128:
26857 case Intrinsic::x86_sse42_pcmpestrio128:
26858 case Intrinsic::x86_sse42_pcmpistris128:
26859 case Intrinsic::x86_sse42_pcmpestris128:
26860 case Intrinsic::x86_sse42_pcmpistriz128:
26861 case Intrinsic::x86_sse42_pcmpestriz128: {
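     // PCMPISTR/PCMPESTR nodes produce three results: the index (i32), the
     // mask (v16i8), and the flags (i32). The *a/*c/*o/*s/*z intrinsic forms
     // only care about one flag, so the flags result (value #2) is fed into a
     // SETCC for the chosen condition and zero-extended to i32.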
26862 unsigned Opcode;
26863 X86::CondCode X86CC;
26864 switch (IntNo) {
26865     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26866 case Intrinsic::x86_sse42_pcmpistria128:
26867 Opcode = X86ISD::PCMPISTR;
26868 X86CC = X86::COND_A;
26869 break;
26870 case Intrinsic::x86_sse42_pcmpestria128:
26871 Opcode = X86ISD::PCMPESTR;
26872 X86CC = X86::COND_A;
26873 break;
26874 case Intrinsic::x86_sse42_pcmpistric128:
26875 Opcode = X86ISD::PCMPISTR;
26876 X86CC = X86::COND_B;
26877 break;
26878 case Intrinsic::x86_sse42_pcmpestric128:
26879 Opcode = X86ISD::PCMPESTR;
26880 X86CC = X86::COND_B;
26881 break;
26882 case Intrinsic::x86_sse42_pcmpistrio128:
26883 Opcode = X86ISD::PCMPISTR;
26884 X86CC = X86::COND_O;
26885 break;
26886 case Intrinsic::x86_sse42_pcmpestrio128:
26887 Opcode = X86ISD::PCMPESTR;
26888 X86CC = X86::COND_O;
26889 break;
26890 case Intrinsic::x86_sse42_pcmpistris128:
26891 Opcode = X86ISD::PCMPISTR;
26892 X86CC = X86::COND_S;
26893 break;
26894 case Intrinsic::x86_sse42_pcmpestris128:
26895 Opcode = X86ISD::PCMPESTR;
26896 X86CC = X86::COND_S;
26897 break;
26898 case Intrinsic::x86_sse42_pcmpistriz128:
26899 Opcode = X86ISD::PCMPISTR;
26900 X86CC = X86::COND_E;
26901 break;
26902 case Intrinsic::x86_sse42_pcmpestriz128:
26903 Opcode = X86ISD::PCMPESTR;
26904 X86CC = X86::COND_E;
26905 break;
26906 }
26907 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26908 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26909 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26910 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26911 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26912 }
26913
26914 case Intrinsic::x86_sse42_pcmpistri128:
26915 case Intrinsic::x86_sse42_pcmpestri128: {
26916 unsigned Opcode;
26917 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26918 Opcode = X86ISD::PCMPISTR;
26919 else
26920 Opcode = X86ISD::PCMPESTR;
26921
26922 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26923 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26924 return DAG.getNode(Opcode, dl, VTs, NewOps);
26925 }
26926
26927 case Intrinsic::x86_sse42_pcmpistrm128:
26928 case Intrinsic::x86_sse42_pcmpestrm128: {
26929 unsigned Opcode;
26930 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26931 Opcode = X86ISD::PCMPISTR;
26932 else
26933 Opcode = X86ISD::PCMPESTR;
26934
26935 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26936 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26937 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26938 }
26939
26940 case Intrinsic::eh_sjlj_lsda: {
26941 MachineFunction &MF = DAG.getMachineFunction();
26942 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26943 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26944 auto &Context = MF.getMMI().getContext();
26945 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26946 Twine(MF.getFunctionNumber()));
26947 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26948 DAG.getMCSymbol(S, PtrVT));
26949 }
26950
26951 case Intrinsic::x86_seh_lsda: {
26952 // Compute the symbol for the LSDA. We know it'll get emitted later.
26953 MachineFunction &MF = DAG.getMachineFunction();
26954 SDValue Op1 = Op.getOperand(1);
26955 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26956 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26957 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26958
26959 // Generate a simple absolute symbol reference. This intrinsic is only
26960 // supported on 32-bit Windows, which isn't PIC.
26961 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26962 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26963 }
26964
26965 case Intrinsic::eh_recoverfp: {
26966 SDValue FnOp = Op.getOperand(1);
26967 SDValue IncomingFPOp = Op.getOperand(2);
26968 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26969 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26970 if (!Fn)
26971 report_fatal_error(
26972 "llvm.eh.recoverfp must take a function as the first argument");
26973 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26974 }
26975
26976 case Intrinsic::localaddress: {
26977 // Returns one of the stack, base, or frame pointer registers, depending on
26978 // which is used to reference local variables.
26979 MachineFunction &MF = DAG.getMachineFunction();
26980 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26981 unsigned Reg;
26982 if (RegInfo->hasBasePointer(MF))
26983 Reg = RegInfo->getBaseRegister();
26984 else { // Handles the SP or FP case.
26985 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26986 if (CantUseFP)
26987 Reg = RegInfo->getPtrSizedStackRegister(MF);
26988 else
26989 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26990 }
26991 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26992 }
26993 case Intrinsic::swift_async_context_addr: {
26994 auto &MF = DAG.getMachineFunction();
26995 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26996 if (Subtarget.is64Bit()) {
26997 MF.getFrameInfo().setFrameAddressIsTaken(true);
26998 X86FI->setHasSwiftAsyncContext(true);
26999 return SDValue(
27000 DAG.getMachineNode(
27001 X86::SUB64ri8, dl, MVT::i64,
27002 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
27003 DAG.getTargetConstant(8, dl, MVT::i32)),
27004 0);
27005 } else {
27006 // 32-bit so no special extended frame, create or reuse an existing stack
27007 // slot.
27008 if (!X86FI->getSwiftAsyncContextFrameIdx())
27009 X86FI->setSwiftAsyncContextFrameIdx(
27010 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
27011 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
27012 }
27013 }
27014 case Intrinsic::x86_avx512_vp2intersect_q_512:
27015 case Intrinsic::x86_avx512_vp2intersect_q_256:
27016 case Intrinsic::x86_avx512_vp2intersect_q_128:
27017 case Intrinsic::x86_avx512_vp2intersect_d_512:
27018 case Intrinsic::x86_avx512_vp2intersect_d_256:
27019 case Intrinsic::x86_avx512_vp2intersect_d_128: {
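     // VP2INTERSECT writes a pair of mask registers. The node is given the
     // MVT::Untyped result type to stand for that register pair, and the two
     // k-mask results are then peeled off with the sub_mask_0 / sub_mask_1
     // subregister extracts and returned as merged values.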
27020 MVT MaskVT = Op.getSimpleValueType();
27021
27022 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27023 SDLoc DL(Op);
27024
27025 SDValue Operation =
27026 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27027 Op->getOperand(1), Op->getOperand(2));
27028
27029 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
27030 MaskVT, Operation);
27031 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
27032 MaskVT, Operation);
27033 return DAG.getMergeValues({Result0, Result1}, DL);
27034 }
27035 case Intrinsic::x86_mmx_pslli_w:
27036 case Intrinsic::x86_mmx_pslli_d:
27037 case Intrinsic::x86_mmx_pslli_q:
27038 case Intrinsic::x86_mmx_psrli_w:
27039 case Intrinsic::x86_mmx_psrli_d:
27040 case Intrinsic::x86_mmx_psrli_q:
27041 case Intrinsic::x86_mmx_psrai_w:
27042 case Intrinsic::x86_mmx_psrai_d: {
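     // Two paths: a constant shift amount is clamped and re-emitted as the
     // same intrinsic with a target-constant operand (a zero amount is a
     // no-op and just returns the source), while a variable amount is moved
     // into an MMX register with MMX_MOVW2D and dispatched to the matching
     // shift-by-register MMX intrinsic below.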
27043 SDLoc DL(Op);
27044 SDValue ShAmt = Op.getOperand(2);
27045 // If the argument is a constant, convert it to a target constant.
27046 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27047 // Clamp out of bounds shift amounts since they will otherwise be masked
27048 // to 8-bits which may make it no longer out of bounds.
27049 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27050 if (ShiftAmount == 0)
27051 return Op.getOperand(1);
27052
27053 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27054 Op.getOperand(0), Op.getOperand(1),
27055 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27056 }
27057
27058 unsigned NewIntrinsic;
27059 switch (IntNo) {
27060     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27061 case Intrinsic::x86_mmx_pslli_w:
27062 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27063 break;
27064 case Intrinsic::x86_mmx_pslli_d:
27065 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27066 break;
27067 case Intrinsic::x86_mmx_pslli_q:
27068 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27069 break;
27070 case Intrinsic::x86_mmx_psrli_w:
27071 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27072 break;
27073 case Intrinsic::x86_mmx_psrli_d:
27074 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27075 break;
27076 case Intrinsic::x86_mmx_psrli_q:
27077 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27078 break;
27079 case Intrinsic::x86_mmx_psrai_w:
27080 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27081 break;
27082 case Intrinsic::x86_mmx_psrai_d:
27083 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27084 break;
27085 }
27086
27087    // The vector shift intrinsics with scalar shift amounts use 32-bit values,
27088    // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27089    // MMX register.
27090 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27091 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27092 DAG.getTargetConstant(NewIntrinsic, DL,
27093 getPointerTy(DAG.getDataLayout())),
27094 Op.getOperand(1), ShAmt);
27095 }
27096 case Intrinsic::thread_pointer: {
27097 if (Subtarget.isTargetELF()) {
27098 SDLoc dl(Op);
27099 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27100 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27101 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
27102 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27103 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27104 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27105 }
27106 report_fatal_error(
27107 "Target OS doesn't support __builtin_thread_pointer() yet.");
27108 }
27109 }
27110}
27111
27112static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27113 SDValue Src, SDValue Mask, SDValue Base,
27114 SDValue Index, SDValue ScaleOp, SDValue Chain,
27115 const X86Subtarget &Subtarget) {
27116 SDLoc dl(Op);
27117 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27118 // Scale must be constant.
27119 if (!C)
27120 return SDValue();
27121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27122 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27123 TLI.getPointerTy(DAG.getDataLayout()));
27124 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27125 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27126 // If source is undef or we know it won't be used, use a zero vector
27127 // to break register dependency.
27128 // TODO: use undef instead and let BreakFalseDeps deal with it?
27129 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27130 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27131
27132 // Cast mask to an integer type.
27133 Mask = DAG.getBitcast(MaskVT, Mask);
27134
27135 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27136
27137 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27138 SDValue Res =
27139 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27140 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27141 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27142}
27143
27144static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27145 SDValue Src, SDValue Mask, SDValue Base,
27146 SDValue Index, SDValue ScaleOp, SDValue Chain,
27147 const X86Subtarget &Subtarget) {
27148 MVT VT = Op.getSimpleValueType();
27149 SDLoc dl(Op);
27150 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27151 // Scale must be constant.
27152 if (!C)
27153 return SDValue();
27154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27155 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27156 TLI.getPointerTy(DAG.getDataLayout()));
27157 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27158 VT.getVectorNumElements());
27159 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27160
27161 // We support two versions of the gather intrinsics. One with scalar mask and
27162 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27163 if (Mask.getValueType() != MaskVT)
27164 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27165
27166 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27167 // If source is undef or we know it won't be used, use a zero vector
27168 // to break register dependency.
27169 // TODO: use undef instead and let BreakFalseDeps deal with it?
27170 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27171 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27172
27173 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27174
27175 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27176 SDValue Res =
27177 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27178 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27179 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27180}
27181
27182static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27183 SDValue Src, SDValue Mask, SDValue Base,
27184 SDValue Index, SDValue ScaleOp, SDValue Chain,
27185 const X86Subtarget &Subtarget) {
27186 SDLoc dl(Op);
27187 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27188 // Scale must be constant.
27189 if (!C)
27190 return SDValue();
27191 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27192 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27193 TLI.getPointerTy(DAG.getDataLayout()));
27194 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27195 Src.getSimpleValueType().getVectorNumElements());
27196 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27197
27198 // We support two versions of the scatter intrinsics. One with scalar mask and
27199 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27200 if (Mask.getValueType() != MaskVT)
27201 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27202
27203 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27204
27205 SDVTList VTs = DAG.getVTList(MVT::Other);
27206 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27207 SDValue Res =
27208 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27209 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27210 return Res;
27211}
27212
27213static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27214 SDValue Mask, SDValue Base, SDValue Index,
27215 SDValue ScaleOp, SDValue Chain,
27216 const X86Subtarget &Subtarget) {
27217 SDLoc dl(Op);
27218 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27219 // Scale must be constant.
27220 if (!C)
27221 return SDValue();
27222 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27223 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27224 TLI.getPointerTy(DAG.getDataLayout()));
27225 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27226 SDValue Segment = DAG.getRegister(0, MVT::i32);
27227 MVT MaskVT =
27228 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27229 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27230 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27231 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27232 return SDValue(Res, 0);
27233}
27234
27235/// Handles the lowering of builtin intrinsics with chain that return their
27236/// value into registers EDX:EAX.
27237/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27238/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27239/// TargetOpcode.
27240/// Returns a Glue value which can be used to add an extra copy-from-reg if the
27241/// expanded intrinsic implicitly defines extra registers (i.e. not just
27242/// EDX:EAX).
27243static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27244 SelectionDAG &DAG,
27245 unsigned TargetOpcode,
27246 unsigned SrcReg,
27247 const X86Subtarget &Subtarget,
27248 SmallVectorImpl<SDValue> &Results) {
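  // Overall flow: optionally copy operand 2 into SrcReg (glued to the chain),
  // emit the target opcode, read the low half from EAX/RAX and the high half
  // from EDX/RDX, and push a single 64-bit result plus the chain into Results.
  // The trailing glue lets callers read additional implicitly defined
  // registers (e.g. ECX for RDTSCP).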
27249 SDValue Chain = N->getOperand(0);
27250 SDValue Glue;
27251
27252 if (SrcReg) {
27253     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27254 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27255 Glue = Chain.getValue(1);
27256 }
27257
27258 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27259 SDValue N1Ops[] = {Chain, Glue};
27260 SDNode *N1 = DAG.getMachineNode(
27261 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27262 Chain = SDValue(N1, 0);
27263
27264 // Reads the content of XCR and returns it in registers EDX:EAX.
27265 SDValue LO, HI;
27266 if (Subtarget.is64Bit()) {
27267 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27268 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27269 LO.getValue(2));
27270 } else {
27271 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27272 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27273 LO.getValue(2));
27274 }
27275 Chain = HI.getValue(1);
27276 Glue = HI.getValue(2);
27277
27278 if (Subtarget.is64Bit()) {
27279 // Merge the two 32-bit values into a 64-bit one.
27280 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27281 DAG.getConstant(32, DL, MVT::i8));
27282 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27283 Results.push_back(Chain);
27284 return Glue;
27285 }
27286
27287 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27288 SDValue Ops[] = { LO, HI };
27289 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27290 Results.push_back(Pair);
27291 Results.push_back(Chain);
27292 return Glue;
27293}
27294
27295/// Handles the lowering of builtin intrinsics that read the time stamp counter
27296/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27297/// READCYCLECOUNTER nodes.
27298static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27299 SelectionDAG &DAG,
27300 const X86Subtarget &Subtarget,
27301 SmallVectorImpl<SDValue> &Results) {
27302 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27303 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27304 // and the EAX register is loaded with the low-order 32 bits.
27305 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27306 /* NoRegister */0, Subtarget,
27307 Results);
27308 if (Opcode != X86::RDTSCP)
27309 return;
27310
27311 SDValue Chain = Results[1];
27312  // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
27313  // into the ECX register. Add 'ecx' explicitly to the chain.
27314 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27315 Results[1] = ecx;
27316 Results.push_back(ecx.getValue(1));
27317}
27318
27319static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27320 SelectionDAG &DAG) {
27321 SmallVector<SDValue, 3> Results;
27322 SDLoc DL(Op);
27323 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27324 Results);
27325 return DAG.getMergeValues(Results, DL);
27326}
27327
27328static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27329 MachineFunction &MF = DAG.getMachineFunction();
27330 SDValue Chain = Op.getOperand(0);
27331 SDValue RegNode = Op.getOperand(2);
27332 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27333 if (!EHInfo)
27334 report_fatal_error("EH registrations only live in functions using WinEH");
27335
27336 // Cast the operand to an alloca, and remember the frame index.
27337 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27338 if (!FINode)
27339 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27340 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27341
27342 // Return the chain operand without making any DAG nodes.
27343 return Chain;
27344}
27345
27346static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27347 MachineFunction &MF = DAG.getMachineFunction();
27348 SDValue Chain = Op.getOperand(0);
27349 SDValue EHGuard = Op.getOperand(2);
27350 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27351 if (!EHInfo)
27352 report_fatal_error("EHGuard only live in functions using WinEH");
27353
27354 // Cast the operand to an alloca, and remember the frame index.
27355 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27356 if (!FINode)
27357 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27358 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27359
27360 // Return the chain operand without making any DAG nodes.
27361 return Chain;
27362}
27363
27364/// Emit Truncating Store with signed or unsigned saturation.
27365static SDValue
27366EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
27367 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27368 SelectionDAG &DAG) {
27369 SDVTList VTs = DAG.getVTList(MVT::Other);
27370 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27371 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27372 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27373 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27374}
27375
27376/// Emit Masked Truncating Store with signed or unsigned saturation.
27377static SDValue
27378EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
27379 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27380 MachineMemOperand *MMO, SelectionDAG &DAG) {
27381 SDVTList VTs = DAG.getVTList(MVT::Other);
27382 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27383 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27384 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27385}
27386
27387static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27388 SelectionDAG &DAG) {
27389 unsigned IntNo = Op.getConstantOperandVal(1);
27390 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27391 if (!IntrData) {
27392 switch (IntNo) {
27393 case llvm::Intrinsic::x86_seh_ehregnode:
27394 return MarkEHRegistrationNode(Op, DAG);
27395 case llvm::Intrinsic::x86_seh_ehguard:
27396 return MarkEHGuard(Op, DAG);
27397 case llvm::Intrinsic::x86_rdpkru: {
27398 SDLoc dl(Op);
27399 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27400 // Create a RDPKRU node and pass 0 to the ECX parameter.
27401 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27402 DAG.getConstant(0, dl, MVT::i32));
27403 }
27404 case llvm::Intrinsic::x86_wrpkru: {
27405 SDLoc dl(Op);
27406 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27407 // to the EDX and ECX parameters.
27408 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27409 Op.getOperand(0), Op.getOperand(2),
27410 DAG.getConstant(0, dl, MVT::i32),
27411 DAG.getConstant(0, dl, MVT::i32));
27412 }
27413 case llvm::Intrinsic::asan_check_memaccess: {
27414 // Mark this as adjustsStack because it will be lowered to a call.
27415 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27416 // Don't do anything here, we will expand these intrinsics out later.
27417 return Op;
27418 }
27419 case llvm::Intrinsic::x86_flags_read_u32:
27420 case llvm::Intrinsic::x86_flags_read_u64:
27421 case llvm::Intrinsic::x86_flags_write_u32:
27422 case llvm::Intrinsic::x86_flags_write_u64: {
27423 // We need a frame pointer because this will get lowered to a PUSH/POP
27424 // sequence.
27425 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27426 MFI.setHasCopyImplyingStackAdjustment(true);
27427 // Don't do anything here, we will expand these intrinsics out later
27428 // during FinalizeISel in EmitInstrWithCustomInserter.
27429 return Op;
27430 }
27431 case Intrinsic::x86_lwpins32:
27432 case Intrinsic::x86_lwpins64:
27433 case Intrinsic::x86_umwait:
27434 case Intrinsic::x86_tpause: {
27435 SDLoc dl(Op);
27436 SDValue Chain = Op->getOperand(0);
27437 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27438 unsigned Opcode;
27439
27440 switch (IntNo) {
27441       default: llvm_unreachable("Impossible intrinsic");
27442 case Intrinsic::x86_umwait:
27443 Opcode = X86ISD::UMWAIT;
27444 break;
27445 case Intrinsic::x86_tpause:
27446 Opcode = X86ISD::TPAUSE;
27447 break;
27448 case Intrinsic::x86_lwpins32:
27449 case Intrinsic::x86_lwpins64:
27450 Opcode = X86ISD::LWPINS;
27451 break;
27452 }
27453
27454 SDValue Operation =
27455 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27456 Op->getOperand(3), Op->getOperand(4));
27457 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27458 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27459 Operation.getValue(1));
27460 }
27461 case Intrinsic::x86_enqcmd:
27462 case Intrinsic::x86_enqcmds: {
27463 SDLoc dl(Op);
27464 SDValue Chain = Op.getOperand(0);
27465 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27466 unsigned Opcode;
27467 switch (IntNo) {
27468       default: llvm_unreachable("Impossible intrinsic!");
27469 case Intrinsic::x86_enqcmd:
27470 Opcode = X86ISD::ENQCMD;
27471 break;
27472 case Intrinsic::x86_enqcmds:
27473 Opcode = X86ISD::ENQCMDS;
27474 break;
27475 }
27476 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27477 Op.getOperand(3));
27478 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27479 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27480 Operation.getValue(1));
27481 }
27482 case Intrinsic::x86_aesenc128kl:
27483 case Intrinsic::x86_aesdec128kl:
27484 case Intrinsic::x86_aesenc256kl:
27485 case Intrinsic::x86_aesdec256kl: {
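      // These Key Locker intrinsics are memory intrinsics: the key handle is
      // referenced through the memory operand. The node returns the
      // transformed data block (v2i64), an i32 flags value, and the chain;
      // the ZF-based status bit (COND_E) is surfaced first in the merged
      // result, followed by the data and the chain.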
27486 SDLoc DL(Op);
27487 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27488 SDValue Chain = Op.getOperand(0);
27489 unsigned Opcode;
27490
27491 switch (IntNo) {
27492       default: llvm_unreachable("Impossible intrinsic");
27493 case Intrinsic::x86_aesenc128kl:
27494 Opcode = X86ISD::AESENC128KL;
27495 break;
27496 case Intrinsic::x86_aesdec128kl:
27497 Opcode = X86ISD::AESDEC128KL;
27498 break;
27499 case Intrinsic::x86_aesenc256kl:
27500 Opcode = X86ISD::AESENC256KL;
27501 break;
27502 case Intrinsic::x86_aesdec256kl:
27503 Opcode = X86ISD::AESDEC256KL;
27504 break;
27505 }
27506
27507 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27508 MachineMemOperand *MMO = MemIntr->getMemOperand();
27509 EVT MemVT = MemIntr->getMemoryVT();
27510 SDValue Operation = DAG.getMemIntrinsicNode(
27511 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27512 MMO);
27513 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27514
27515 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27516 {ZF, Operation.getValue(0), Operation.getValue(2)});
27517 }
27518 case Intrinsic::x86_aesencwide128kl:
27519 case Intrinsic::x86_aesdecwide128kl:
27520 case Intrinsic::x86_aesencwide256kl:
27521 case Intrinsic::x86_aesdecwide256kl: {
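      // Wide variant of the Key Locker lowering above: eight v2i64 data
      // blocks go in and come back as results 1-8 of the node, with the
      // flags in result 0 and the chain last. As before, the ZF-based status
      // bit is returned first, then the eight transformed blocks, then the
      // chain.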
27522 SDLoc DL(Op);
27523 SDVTList VTs = DAG.getVTList(
27524 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27525 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27526 SDValue Chain = Op.getOperand(0);
27527 unsigned Opcode;
27528
27529 switch (IntNo) {
27530       default: llvm_unreachable("Impossible intrinsic");
27531 case Intrinsic::x86_aesencwide128kl:
27532 Opcode = X86ISD::AESENCWIDE128KL;
27533 break;
27534 case Intrinsic::x86_aesdecwide128kl:
27535 Opcode = X86ISD::AESDECWIDE128KL;
27536 break;
27537 case Intrinsic::x86_aesencwide256kl:
27538 Opcode = X86ISD::AESENCWIDE256KL;
27539 break;
27540 case Intrinsic::x86_aesdecwide256kl:
27541 Opcode = X86ISD::AESDECWIDE256KL;
27542 break;
27543 }
27544
27545 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27546 MachineMemOperand *MMO = MemIntr->getMemOperand();
27547 EVT MemVT = MemIntr->getMemoryVT();
27548 SDValue Operation = DAG.getMemIntrinsicNode(
27549 Opcode, DL, VTs,
27550 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27551 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27552 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27553 MemVT, MMO);
27554 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27555
27556 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27557 {ZF, Operation.getValue(1), Operation.getValue(2),
27558 Operation.getValue(3), Operation.getValue(4),
27559 Operation.getValue(5), Operation.getValue(6),
27560 Operation.getValue(7), Operation.getValue(8),
27561 Operation.getValue(9)});
27562 }
27563 case Intrinsic::x86_testui: {
27564 SDLoc dl(Op);
27565 SDValue Chain = Op.getOperand(0);
27566 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27567 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27568 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27569 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27570 Operation.getValue(1));
27571 }
27572 case Intrinsic::x86_atomic_bts:
27573 case Intrinsic::x86_atomic_btc:
27574 case Intrinsic::x86_atomic_btr: {
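      // The locked bit-test nodes (LBTS/LBTC/LBTR) report the original value
      // of the tested bit in CF. SETB turns that into a 0/1 value, which is
      // widened to the operation type and shifted left by the bit index so
      // the returned value has the bit in its original position.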
27575 SDLoc DL(Op);
27576 MVT VT = Op.getSimpleValueType();
27577 SDValue Chain = Op.getOperand(0);
27578 SDValue Op1 = Op.getOperand(2);
27579 SDValue Op2 = Op.getOperand(3);
27580 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27581 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27582 : X86ISD::LBTR;
27583 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27584 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27585 SDValue Res =
27586 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27587 {Chain, Op1, Op2, Size}, VT, MMO);
27588 Chain = Res.getValue(1);
27589 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27590 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
27591 if (Imm)
27592 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27593 DAG.getShiftAmountConstant(Imm, VT, DL));
27594 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27595 }
27596 }
27597 return SDValue();
27598 }
27599
27600 SDLoc dl(Op);
27601 switch(IntrData->Type) {
27602   default: llvm_unreachable("Unknown Intrinsic Type");
27603 case RDSEED:
27604 case RDRAND: {
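    // The RDRAND/RDSEED node produces three results: the random value, an
    // i32 flags value, and the chain. The flags result (value #1) drives the
    // CMOV below, and the chain (value #2) becomes the third merged value.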
27605 // Emit the node with the right value type.
27606 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27607 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27608
27609 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27610    // Otherwise return the value from Rand, which is always 0, cast to i32.
27611 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27612 DAG.getConstant(1, dl, Op->getValueType(1)),
27613 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27614 SDValue(Result.getNode(), 1)};
27615 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27616
27617 // Return { result, isValid, chain }.
27618 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27619 SDValue(Result.getNode(), 2));
27620 }
27621 case GATHER_AVX2: {
27622 SDValue Chain = Op.getOperand(0);
27623 SDValue Src = Op.getOperand(2);
27624 SDValue Base = Op.getOperand(3);
27625 SDValue Index = Op.getOperand(4);
27626 SDValue Mask = Op.getOperand(5);
27627 SDValue Scale = Op.getOperand(6);
27628 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27629 Scale, Chain, Subtarget);
27630 }
27631 case GATHER: {
27632 //gather(v1, mask, index, base, scale);
27633 SDValue Chain = Op.getOperand(0);
27634 SDValue Src = Op.getOperand(2);
27635 SDValue Base = Op.getOperand(3);
27636 SDValue Index = Op.getOperand(4);
27637 SDValue Mask = Op.getOperand(5);
27638 SDValue Scale = Op.getOperand(6);
27639 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27640 Chain, Subtarget);
27641 }
27642 case SCATTER: {
27643 //scatter(base, mask, index, v1, scale);
27644 SDValue Chain = Op.getOperand(0);
27645 SDValue Base = Op.getOperand(2);
27646 SDValue Mask = Op.getOperand(3);
27647 SDValue Index = Op.getOperand(4);
27648 SDValue Src = Op.getOperand(5);
27649 SDValue Scale = Op.getOperand(6);
27650 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27651 Scale, Chain, Subtarget);
27652 }
27653 case PREFETCH: {
27654 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27655     assert((HintVal == 2 || HintVal == 3) &&
27656            "Wrong prefetch hint in intrinsic: should be 2 or 3");
27657 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27658 SDValue Chain = Op.getOperand(0);
27659 SDValue Mask = Op.getOperand(2);
27660 SDValue Index = Op.getOperand(3);
27661 SDValue Base = Op.getOperand(4);
27662 SDValue Scale = Op.getOperand(5);
27663 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27664 Subtarget);
27665 }
27666 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27667 case RDTSC: {
27668 SmallVector<SDValue, 2> Results;
27669 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27670 Results);
27671 return DAG.getMergeValues(Results, dl);
27672 }
27673 // Read Performance Monitoring Counters.
27674 case RDPMC:
27675 // GetExtended Control Register.
27676 case XGETBV: {
27677 SmallVector<SDValue, 2> Results;
27678
27679 // RDPMC uses ECX to select the index of the performance counter to read.
27680 // XGETBV uses ECX to select the index of the XCR register to return.
27681 // The result is stored into registers EDX:EAX.
27682 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27683 Subtarget, Results);
27684 return DAG.getMergeValues(Results, dl);
27685 }
27686 // XTEST intrinsics.
27687 case XTEST: {
27688 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27689 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27690
27691 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27692 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27693 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27694 Ret, SDValue(InTrans.getNode(), 1));
27695 }
27696 case TRUNCATE_TO_MEM_VI8:
27697 case TRUNCATE_TO_MEM_VI16:
27698 case TRUNCATE_TO_MEM_VI32: {
27699 SDValue Mask = Op.getOperand(4);
27700 SDValue DataToTruncate = Op.getOperand(3);
27701 SDValue Addr = Op.getOperand(2);
27702 SDValue Chain = Op.getOperand(0);
27703
27704 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27705     assert(MemIntr && "Expected MemIntrinsicSDNode!");
27706
27707 EVT MemVT = MemIntr->getMemoryVT();
27708
27709 uint16_t TruncationOp = IntrData->Opc0;
27710 switch (TruncationOp) {
27711 case X86ISD::VTRUNC: {
27712 if (isAllOnesConstant(Mask)) // return just a truncate store
27713 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27714 MemIntr->getMemOperand());
27715
27716 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27717 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27718 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27719
27720 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27721 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27722 true /* truncating */);
27723 }
27724 case X86ISD::VTRUNCUS:
27725 case X86ISD::VTRUNCS: {
27726 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27727 if (isAllOnesConstant(Mask))
27728 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27729 MemIntr->getMemOperand(), DAG);
27730
27731 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27732 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27733
27734 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27735 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27736 }
27737 default:
27738       llvm_unreachable("Unsupported truncstore intrinsic");
27739 }
27740 }
27741 }
27742}
27743
27744SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27745 SelectionDAG &DAG) const {
27746 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27747 MFI.setReturnAddressIsTaken(true);
27748
27749 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27750 return SDValue();
27751
27752 unsigned Depth = Op.getConstantOperandVal(0);
27753 SDLoc dl(Op);
27754 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27755
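  // For a nonzero depth, walk up via LowerFRAMEADDR and read the return
  // address stored just above the saved frame pointer, i.e. at
  // FrameAddr + SlotSize.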
27756 if (Depth > 0) {
27757 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27758 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27759 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27760 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27761 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27762 MachinePointerInfo());
27763 }
27764
27765 // Just load the return address.
27766 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27767 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27768 MachinePointerInfo());
27769}
27770
27771SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27772 SelectionDAG &DAG) const {
27773 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27774 return getReturnAddressFrameIndex(DAG);
27775}
27776
27777SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27778 MachineFunction &MF = DAG.getMachineFunction();
27779 MachineFrameInfo &MFI = MF.getFrameInfo();
27780 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27781 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27782 EVT VT = Op.getValueType();
27783
27784 MFI.setFrameAddressIsTaken(true);
27785
27786 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27787 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27788 // is not possible to crawl up the stack without looking at the unwind codes
27789 // simultaneously.
27790 int FrameAddrIndex = FuncInfo->getFAIndex();
27791 if (!FrameAddrIndex) {
27792 // Set up a frame object for the return address.
27793 unsigned SlotSize = RegInfo->getSlotSize();
27794 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27795 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27796 FuncInfo->setFAIndex(FrameAddrIndex);
27797 }
27798 return DAG.getFrameIndex(FrameAddrIndex, VT);
27799 }
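  // Non-Windows path: start from the frame-pointer register (RBP/EBP) and,
  // for each requested level of depth, load the caller's saved frame pointer
  // through the current one.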
27800
27801 unsigned FrameReg =
27802 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27803 SDLoc dl(Op); // FIXME probably not meaningful
27804 unsigned Depth = Op.getConstantOperandVal(0);
27805   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27806           (FrameReg == X86::EBP && VT == MVT::i32)) &&
27807          "Invalid Frame Register!");
27808 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27809 while (Depth--)
27810 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27811 MachinePointerInfo());
27812 return FrameAddr;
27813}
27814
27815// FIXME? Maybe this could be a TableGen attribute on some registers and
27816// this table could be generated automatically from RegInfo.
27817Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27818 const MachineFunction &MF) const {
27819 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27820
27821 Register Reg = StringSwitch<unsigned>(RegName)
27822 .Case("esp", X86::ESP)
27823 .Case("rsp", X86::RSP)
27824 .Case("ebp", X86::EBP)
27825 .Case("rbp", X86::RBP)
27826 .Default(0);
27827
27828 if (Reg == X86::EBP || Reg == X86::RBP) {
27829 if (!TFI.hasFP(MF))
27830 report_fatal_error("register " + StringRef(RegName) +
27831 " is allocatable: function has no frame pointer");
27832#ifndef NDEBUG
27833 else {
27834 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27835 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27836       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27837              "Invalid Frame Register!");
27838 }
27839#endif
27840 }
27841
27842 if (Reg)
27843 return Reg;
27844
27845 report_fatal_error("Invalid register name global variable");
27846}
27847
27848SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27849 SelectionDAG &DAG) const {
27850 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27851 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27852}
27853
27854Register X86TargetLowering::getExceptionPointerRegister(
27855 const Constant *PersonalityFn) const {
27856 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27857 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27858
27859 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27860}
27861
27862Register X86TargetLowering::getExceptionSelectorRegister(
27863 const Constant *PersonalityFn) const {
27864 // Funclet personalities don't use selectors (the runtime does the selection).
27865 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27866 return X86::NoRegister;
27867 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27868}
27869
27870bool X86TargetLowering::needsFixedCatchObjects() const {
27871 return Subtarget.isTargetWin64();
27872}
27873
27874SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27875 SDValue Chain = Op.getOperand(0);
27876 SDValue Offset = Op.getOperand(1);
27877 SDValue Handler = Op.getOperand(2);
27878 SDLoc dl (Op);
27879
27880 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27881 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27882 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27883   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27884           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27885          "Invalid Frame Register!");
27886 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27887 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27888
27889 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27890 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27891 dl));
27892 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27893 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27894 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27895
27896 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27897 DAG.getRegister(StoreAddrReg, PtrVT));
27898}
27899
27900SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27901 SelectionDAG &DAG) const {
27902 SDLoc DL(Op);
27903  // If the subtarget is not 64-bit, we may need the global base register
27904  // after ISel expands the pseudo-instruction, i.e., after the CGBR pass runs.
27905  // Therefore, ask for the GlobalBaseReg now, so that the pass
27906  // inserts the code for us in case we need it.
27907  // Otherwise, we would end up referencing a virtual register
27908  // that is not defined!
27909 if (!Subtarget.is64Bit()) {
27910 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27911 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27912 }
27913 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27914 DAG.getVTList(MVT::i32, MVT::Other),
27915 Op.getOperand(0), Op.getOperand(1));
27916}
27917
27918SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27919 SelectionDAG &DAG) const {
27920 SDLoc DL(Op);
27921 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27922 Op.getOperand(0), Op.getOperand(1));
27923}
27924
27925SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27926 SelectionDAG &DAG) const {
27927 SDLoc DL(Op);
27928 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27929 Op.getOperand(0));
27930}
27931
27932static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27933 return Op.getOperand(0);
27934}
27935
27936SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27937 SelectionDAG &DAG) const {
27938 SDValue Root = Op.getOperand(0);
27939 SDValue Trmp = Op.getOperand(1); // trampoline
27940 SDValue FPtr = Op.getOperand(2); // nested function
27941 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27942 SDLoc dl (Op);
27943
27944 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27945 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27946
27947 if (Subtarget.is64Bit()) {
27948 SDValue OutChains[6];
27949
27950 // Large code-model.
27951 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27952 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27953
27954 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27955 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27956
27957 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27958
27959 // Load the pointer to the nested function into R11.
27960 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27961 SDValue Addr = Trmp;
27962 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27963 Addr, MachinePointerInfo(TrmpAddr));
27964
27965 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27966 DAG.getConstant(2, dl, MVT::i64));
27967 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27968 MachinePointerInfo(TrmpAddr, 2), Align(2));
27969
27970 // Load the 'nest' parameter value into R10.
27971 // R10 is specified in X86CallingConv.td
27972 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27973 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27974 DAG.getConstant(10, dl, MVT::i64));
27975 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27976 Addr, MachinePointerInfo(TrmpAddr, 10));
27977
27978 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27979 DAG.getConstant(12, dl, MVT::i64));
27980 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27981 MachinePointerInfo(TrmpAddr, 12), Align(2));
27982
27983 // Jump to the nested function.
27984 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27985 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27986 DAG.getConstant(20, dl, MVT::i64));
27987 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27988 Addr, MachinePointerInfo(TrmpAddr, 20));
27989
27990 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27991 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27992 DAG.getConstant(22, dl, MVT::i64));
27993 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27994 Addr, MachinePointerInfo(TrmpAddr, 22));
27995
27996 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27997 } else {
27998 const Function *Func =
27999 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28000 CallingConv::ID CC = Func->getCallingConv();
28001 unsigned NestReg;
28002
28003 switch (CC) {
28004 default:
28005 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28005)
;
28006 case CallingConv::C:
28007 case CallingConv::X86_StdCall: {
28008 // Pass 'nest' parameter in ECX.
28009 // Must be kept in sync with X86CallingConv.td
28010 NestReg = X86::ECX;
28011
28012 // Check that ECX wasn't needed by an 'inreg' parameter.
28013 FunctionType *FTy = Func->getFunctionType();
28014 const AttributeList &Attrs = Func->getAttributes();
28015
28016 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28017 unsigned InRegCount = 0;
28018 unsigned Idx = 0;
28019
28020 for (FunctionType::param_iterator I = FTy->param_begin(),
28021 E = FTy->param_end(); I != E; ++I, ++Idx)
28022 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28023 const DataLayout &DL = DAG.getDataLayout();
28024 // FIXME: should only count parameters that are lowered to integers.
28025 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28026 }
28027
28028 if (InRegCount > 2) {
28029 report_fatal_error("Nest register in use - reduce number of inreg"
28030 " parameters!");
28031 }
28032 }
28033 break;
28034 }
28035 case CallingConv::X86_FastCall:
28036 case CallingConv::X86_ThisCall:
28037 case CallingConv::Fast:
28038 case CallingConv::Tail:
28039 case CallingConv::SwiftTail:
28040 // Pass 'nest' parameter in EAX.
28041 // Must be kept in sync with X86CallingConv.td
28042 NestReg = X86::EAX;
28043 break;
28044 }
28045
28046 SDValue OutChains[4];
28047 SDValue Addr, Disp;
28048
28049 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28050 DAG.getConstant(10, dl, MVT::i32));
28051 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28052
28053 // This is storing the opcode for MOV32ri.
28054 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28055 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28056 OutChains[0] =
28057 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28058 Trmp, MachinePointerInfo(TrmpAddr));
28059
28060 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28061 DAG.getConstant(1, dl, MVT::i32));
28062 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28063 MachinePointerInfo(TrmpAddr, 1), Align(1));
28064
28065 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28066 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28067 DAG.getConstant(5, dl, MVT::i32));
28068 OutChains[2] =
28069 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28070 MachinePointerInfo(TrmpAddr, 5), Align(1));
28071
28072 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28073 DAG.getConstant(6, dl, MVT::i32));
28074 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28075 MachinePointerInfo(TrmpAddr, 6), Align(1));
28076
28077 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28078 }
28079}
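// Illustrative aside (not part of the original file): the byte sequence the
// 64-bit branch above writes into the trampoline, assuming R11/R10 encode as
// 3/2 and little-endian stores; fixed-width types come from the file's
// existing includes. The helper name below is hypothetical.
static void writeX86_64TrampolineSketch(uint8_t *Trmp, uint64_t FPtr,
                                        uint64_t Nest) {
  Trmp[0] = 0x49; Trmp[1] = 0xBB;                    // movabsq $FPtr, %r11
  for (int i = 0; i < 8; ++i) Trmp[2 + i] = (uint8_t)(FPtr >> (8 * i));
  Trmp[10] = 0x49; Trmp[11] = 0xBA;                  // movabsq $Nest, %r10
  for (int i = 0; i < 8; ++i) Trmp[12 + i] = (uint8_t)(Nest >> (8 * i));
  Trmp[20] = 0x49; Trmp[21] = 0xFF; Trmp[22] = 0xE3; // jmpq *%r11
}
// The 32-bit branch instead emits "movl $Nest, %ecx/%eax; jmp rel32", where
// rel32 = FPtr - (Trmp + 10), i.e. relative to the end of the 5-byte jmp that
// starts at trampoline offset 5.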
28080
28081SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
28082 SelectionDAG &DAG) const {
28083 /*
28084 The rounding mode is in bits 11:10 of FPSR, and has the following
28085 settings:
28086 00 Round to nearest
28087 01 Round to -inf
28088 10 Round to +inf
28089 11 Round to 0
28090
28091 FLT_ROUNDS, on the other hand, expects the following:
28092 -1 Undefined
28093 0 Round to 0
28094 1 Round to nearest
28095 2 Round to +inf
28096 3 Round to -inf
28097
28098 To perform the conversion, we use a packed lookup table of the four 2-bit
28099 values that we can index by FPSR[11:10]
28100 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28101
28102 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28103 */
28104
28105 MachineFunction &MF = DAG.getMachineFunction();
28106 MVT VT = Op.getSimpleValueType();
28107 SDLoc DL(Op);
28108
28109 // Save FP Control Word to stack slot
28110 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28111 SDValue StackSlot =
28112 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28113
28114 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28115
28116 SDValue Chain = Op.getOperand(0);
28117 SDValue Ops[] = {Chain, StackSlot};
28118 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28119 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28120 Align(2), MachineMemOperand::MOStore);
28121
28122 // Load FP Control Word from stack slot
28123 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28124 Chain = CWD.getValue(1);
28125
28126 // Mask and turn the control bits into a shift for the lookup table.
28127 SDValue Shift =
28128 DAG.getNode(ISD::SRL, DL, MVT::i16,
28129 DAG.getNode(ISD::AND, DL, MVT::i16,
28130 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28131 DAG.getConstant(9, DL, MVT::i8));
28132 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28133
28134 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28135 SDValue RetVal =
28136 DAG.getNode(ISD::AND, DL, MVT::i32,
28137 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28138 DAG.getConstant(3, DL, MVT::i32));
28139
28140 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28141
28142 return DAG.getMergeValues({RetVal, Chain}, DL);
28143}
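// Illustrative aside (not part of the original file): a minimal scalar sketch
// of the table lookup above, assuming the fixed-width types already included
// by this file. The four x87 RM encodings map onto FLT_ROUNDS values exactly
// as the comment block describes.
static constexpr int fltRoundsFromFPCW(uint16_t CWD) {
  return (0x2d >> ((CWD & 0xc00) >> 9)) & 3;
}
static_assert(fltRoundsFromFPCW(0x0000) == 1, "RM=00 (nearest)     -> 1");
static_assert(fltRoundsFromFPCW(0x0400) == 3, "RM=01 (toward -inf) -> 3");
static_assert(fltRoundsFromFPCW(0x0800) == 2, "RM=10 (toward +inf) -> 2");
static_assert(fltRoundsFromFPCW(0x0c00) == 0, "RM=11 (toward zero) -> 0");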
28144
28145SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28146 SelectionDAG &DAG) const {
28147 MachineFunction &MF = DAG.getMachineFunction();
28148 SDLoc DL(Op);
28149 SDValue Chain = Op.getNode()->getOperand(0);
28150
28151 // The FP control word may be set only from data in memory, so we need to
28152 // allocate stack space to save/load the FP control word.
28153 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28154 SDValue StackSlot =
28155 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28156 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28157 MachineMemOperand *MMO =
28158 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28159
28160 // Store FP control word into memory.
28161 SDValue Ops[] = {Chain, StackSlot};
28162 Chain = DAG.getMemIntrinsicNode(
28163 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28164
28165 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28166 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28167 Chain = CWD.getValue(1);
28168 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28169 DAG.getConstant(0xf3ff, DL, MVT::i16));
28170
28171 // Calculate new rounding mode.
28172 SDValue NewRM = Op.getNode()->getOperand(1);
28173 SDValue RMBits;
28174 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28175 uint64_t RM = CVal->getZExtValue();
28176 int FieldVal;
28177 switch (static_cast<RoundingMode>(RM)) {
28178 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28179 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28180 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28181 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28182 default:
28183 llvm_unreachable("rounding mode is not supported by X86 hardware")::llvm::llvm_unreachable_internal("rounding mode is not supported by X86 hardware"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28183)
;
28184 }
28185 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28186 } else {
28187 // Need to convert argument into bits of control word:
28188 // 0 Round to 0 -> 11
28189 // 1 Round to nearest -> 00
28190 // 2 Round to +inf -> 10
28191 // 3 Round to -inf -> 01
28192 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28193 // To make the conversion, pack all these values into the constant 0xc9 and shift
28194 // it left depending on the rounding mode:
28195 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28196 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28197 // ...
28198 // (0xc9 << (2 * NewRM + 4)) & 0xc00
28199 SDValue ShiftValue =
28200 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28201 DAG.getNode(ISD::ADD, DL, MVT::i32,
28202 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28203 DAG.getConstant(1, DL, MVT::i8)),
28204 DAG.getConstant(4, DL, MVT::i32)));
28205 SDValue Shifted =
28206 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28207 ShiftValue);
28208 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28209 DAG.getConstant(0xc00, DL, MVT::i16));
28210 }
28211
28212 // Update rounding mode bits and store the new FP Control Word into stack.
28213 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28214 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
28215
28216 // Load FP control word from the slot.
28217 SDValue OpsLD[] = {Chain, StackSlot};
28218 MachineMemOperand *MMOL =
28219 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28220 Chain = DAG.getMemIntrinsicNode(
28221 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28222
28223 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28224 // same way but in bits 14:13.
28225 if (Subtarget.hasSSE1()) {
28226 // Store MXCSR into memory.
28227 Chain = DAG.getNode(
28228 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28229 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28230 StackSlot);
28231
28232 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28233 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28234 Chain = CWD.getValue(1);
28235 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28236 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28237
28238 // Shift X87 RM bits from 11:10 to 14:13.
28239 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28240 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28241 DAG.getConstant(3, DL, MVT::i8));
28242
28243 // Update rounding mode bits and store the new FP Control Word into stack.
28244 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28245 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
28246
28247 // Load MXCSR from the slot.
28248 Chain = DAG.getNode(
28249 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28250 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28251 StackSlot);
28252 }
28253
28254 return Chain;
28255}
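// Illustrative aside (not part of the original file): a scalar sketch of the
// inverse mapping used by the non-constant path above,
// (0xc9 << (2 * RM + 4)) & 0xc00, assuming the file's fixed-width types. For
// SSE the same two bits are shifted left by 3 into MXCSR bits 14:13, as done
// at the end of the function.
static constexpr uint16_t x87RMBitsFromFltRounds(unsigned RM) {
  return (uint16_t)((0xc9 << (2 * RM + 4)) & 0xc00);
}
static_assert(x87RMBitsFromFltRounds(0) == 0xc00, "0 (toward zero) -> RM=11");
static_assert(x87RMBitsFromFltRounds(1) == 0x000, "1 (nearest)     -> RM=00");
static_assert(x87RMBitsFromFltRounds(2) == 0x800, "2 (toward +inf) -> RM=10");
static_assert(x87RMBitsFromFltRounds(3) == 0x400, "3 (toward -inf) -> RM=01");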
28256
28257/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28258//
28259// i8/i16 vector implemented using dword LZCNT vector instruction
28260// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28261// split the vector, perform the operation on its Lo and Hi parts and
28262// concatenate the results.
28263static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28264 const X86Subtarget &Subtarget) {
28265 assert(Op.getOpcode() == ISD::CTLZ);
28266 SDLoc dl(Op);
28267 MVT VT = Op.getSimpleValueType();
28268 MVT EltVT = VT.getVectorElementType();
28269 unsigned NumElems = VT.getVectorNumElements();
28270
28271 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28272 "Unsupported element type");
28273
28274 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28275 if (NumElems > 16 ||
28276 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28277 return splitVectorIntUnary(Op, DAG);
28278
28279 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28280 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28281 "Unsupported value type for operation");
28282
28283 // Use native supported vector instruction vplzcntd.
28284 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28285 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28286 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28287 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28288
28289 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28290}
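// Illustrative aside (not part of the original file): what the zext+lzcnt+sub
// sequence above computes for a single i8 element, written as scalar C++ with
// the clang builtin __builtin_clz standing in for VPLZCNTD.
static uint8_t ctlz8ViaLzcnt32Sketch(uint8_t X) {
  // VPLZCNTD counts leading zeros of the 32-bit zero-extension and yields 32
  // for a zero input; subtract the 32 - 8 = 24 zero bits added by the zext.
  unsigned Lz32 = X ? (unsigned)__builtin_clz((uint32_t)X) : 32u;
  return (uint8_t)(Lz32 - (32 - 8));
}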
28291
28292// Lower CTLZ using a PSHUFB lookup table implementation.
28293static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28294 const X86Subtarget &Subtarget,
28295 SelectionDAG &DAG) {
28296 MVT VT = Op.getSimpleValueType();
28297 int NumElts = VT.getVectorNumElements();
28298 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28299 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28300
28301 // Per-nibble leading zero PSHUFB lookup table.
28302 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28303 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28304 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28305 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28306
28307 SmallVector<SDValue, 64> LUTVec;
28308 for (int i = 0; i < NumBytes; ++i)
28309 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28310 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28311
28312 // Begin by bitcasting the input to byte vector, then split those bytes
28313 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28314 // If the hi input nibble is zero then we add both results together, otherwise
28315 // we just take the hi result (by masking the lo result to zero before the
28316 // add).
28317 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28318 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28319
28320 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28321 SDValue Lo = Op0;
28322 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28323 SDValue HiZ;
28324 if (CurrVT.is512BitVector()) {
28325 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28326 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28327 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28328 } else {
28329 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28330 }
28331
28332 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28333 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28334 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28335 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28336
28337 // Merge result back from vXi8 back to VT, working on the lo/hi halves
28338 // of the current vector width in the same way we did for the nibbles.
28339 // If the upper half of the input element is zero then add the halves'
28340 // leading zero counts together, otherwise just use the upper half's.
28341 // Double the width of the result until we are at target width.
28342 while (CurrVT != VT) {
28343 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28344 int CurrNumElts = CurrVT.getVectorNumElements();
28345 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28346 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28347 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28348
28349 // Check if the upper half of the input element is zero.
28350 if (CurrVT.is512BitVector()) {
28351 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28352 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28353 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28354 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28355 } else {
28356 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28357 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28358 }
28359 HiZ = DAG.getBitcast(NextVT, HiZ);
28360
28361 // Move the upper/lower halves to the lower bits as we'll be extending to
28362 // NextVT. Mask the lower result to zero if HiZ is true and add the results
28363 // together.
28364 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28365 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28366 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28367 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28368 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28369 CurrVT = NextVT;
28370 }
28371
28372 return Res;
28373}
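// Illustrative aside (not part of the original file): the per-nibble PSHUFB
// lookup above, reduced to one byte of scalar C++ (assumes the file's
// fixed-width types).
static uint8_t ctlz8ViaNibbleLUTSketch(uint8_t X) {
  static const uint8_t LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                  0, 0, 0, 0, 0, 0, 0, 0};
  uint8_t Lo = X & 0xf;
  uint8_t Hi = X >> 4;
  // Only add the low-nibble count when the high nibble is all zero, exactly
  // like the HiZ mask in the vector code above.
  return Hi ? LUT[Hi] : (uint8_t)(4 + LUT[Lo]);
}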
28374
28375static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28376 const X86Subtarget &Subtarget,
28377 SelectionDAG &DAG) {
28378 MVT VT = Op.getSimpleValueType();
28379
28380 if (Subtarget.hasCDI() &&
28381 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28382 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28383 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28384
28385 // Decompose 256-bit ops into smaller 128-bit ops.
28386 if (VT.is256BitVector() && !Subtarget.hasInt256())
28387 return splitVectorIntUnary(Op, DAG);
28388
28389 // Decompose 512-bit ops into smaller 256-bit ops.
28390 if (VT.is512BitVector() && !Subtarget.hasBWI())
28391 return splitVectorIntUnary(Op, DAG);
28392
28393 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28394 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28395}
28396
28397static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28398 SelectionDAG &DAG) {
28399 MVT VT = Op.getSimpleValueType();
28400 MVT OpVT = VT;
28401 unsigned NumBits = VT.getSizeInBits();
28402 SDLoc dl(Op);
28403 unsigned Opc = Op.getOpcode();
28404
28405 if (VT.isVector())
28406 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28407
28408 Op = Op.getOperand(0);
28409 if (VT == MVT::i8) {
28410 // Zero extend to i32 since there is not an i8 bsr.
28411 OpVT = MVT::i32;
28412 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28413 }
28414
28415 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28416 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28417 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28418
28419 if (Opc == ISD::CTLZ) {
28420 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28421 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28422 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28423 Op.getValue(1)};
28424 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28425 }
28426
28427 // Finally xor with NumBits-1.
28428 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28429 DAG.getConstant(NumBits - 1, dl, OpVT));
28430
28431 if (VT == MVT::i8)
28432 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28433 return Op;
28434}
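// Illustrative aside (not part of the original file): the scalar BSR-based
// lowering above in plain C++ (__builtin_clz stands in for BSR). BSR returns
// the index of the highest set bit, so CTLZ is that index XORed with
// NumBits-1; the CMOV substitutes 2*NumBits-1 for a zero input so the same
// XOR yields NumBits.
static unsigned ctlz32ViaBSRSketch(uint32_t X) {
  const unsigned NumBits = 32;
  unsigned Bsr = X ? (31u - (unsigned)__builtin_clz(X)) // bit index from BSR
                   : (NumBits + NumBits - 1);           // CMOV taken on ZF
  return Bsr ^ (NumBits - 1);
}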
28435
28436static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28437 SelectionDAG &DAG) {
28438 MVT VT = Op.getSimpleValueType();
28439 unsigned NumBits = VT.getScalarSizeInBits();
28440 SDValue N0 = Op.getOperand(0);
28441 SDLoc dl(Op);
28442
28443 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28444 "Only scalar CTTZ requires custom lowering");
28445
28446 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28447 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28448 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28449
28450 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28451 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28452 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28453 Op.getValue(1)};
28454 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28455}
28456
28457static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28458 const X86Subtarget &Subtarget) {
28459 MVT VT = Op.getSimpleValueType();
28460 if (VT == MVT::i16 || VT == MVT::i32)
28461 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
28462
28463 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28464 return splitVectorIntBinary(Op, DAG);
28465
28466 assert(Op.getSimpleValueType().is256BitVector() &&
28467 Op.getSimpleValueType().isInteger() &&
28468 "Only handle AVX 256-bit vector integer operation");
28469 return splitVectorIntBinary(Op, DAG);
28470}
28471
28472static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28473 const X86Subtarget &Subtarget) {
28474 MVT VT = Op.getSimpleValueType();
28475 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28476 unsigned Opcode = Op.getOpcode();
28477 SDLoc DL(Op);
28478
28479 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28480 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28481 assert(Op.getSimpleValueType().isInteger() &&
28482 "Only handle AVX vector integer operation");
28483 return splitVectorIntBinary(Op, DAG);
28484 }
28485
28486 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28488 EVT SetCCResultType =
28489 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28490
28491 unsigned BitWidth = VT.getScalarSizeInBits();
28492 if (Opcode == ISD::USUBSAT) {
28493 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28494 // Handle a special-case with a bit-hack instead of cmp+select:
28495 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28496 // If the target can use VPTERNLOG, DAGToDAG will match this as
28497 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28498 // "broadcast" constant load.
28499 ConstantSDNode *C = isConstOrConstSplat(Y, true);
28500 if (C && C->getAPIntValue().isSignMask()) {
28501 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28502 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28503 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28504 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28505 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28506 }
28507 }
28508 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28509 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28510 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28511 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28512 // TODO: Move this to DAGCombiner?
28513 if (SetCCResultType == VT &&
28514 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28515 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28516 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28517 }
28518 }
28519
28520 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28521 (!VT.isVector() || VT == MVT::v2i64)) {
28522 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28523 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28524 SDValue Zero = DAG.getConstant(0, DL, VT);
28525 SDValue Result =
28526 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28527 DAG.getVTList(VT, SetCCResultType), X, Y);
28528 SDValue SumDiff = Result.getValue(0);
28529 SDValue Overflow = Result.getValue(1);
28530 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28531 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28532 SDValue SumNeg =
28533 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28534 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28535 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28536 }
28537
28538 // Use default expansion.
28539 return SDValue();
28540}
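// Illustrative aside (not part of the original file): the special case
// "usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)" above for 8-bit lanes,
// where SMIN is 0x80. When X has its sign bit set, X ^ 0x80 == X - 0x80 and
// the arithmetic shift produces all-ones; otherwise the shift is zero and the
// result saturates to 0.
static uint8_t usubsatSignMask8Sketch(uint8_t X) {
  uint8_t Xor = X ^ 0x80;
  uint8_t Sra = (uint8_t)((int8_t)X >> 7); // 0xff if X >= 0x80, else 0x00
  return Xor & Sra;
}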
28541
28542static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28543 SelectionDAG &DAG) {
28544 MVT VT = Op.getSimpleValueType();
28545 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28546 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28547 // 8-bit integer abs to NEG and CMOV.
28548 SDLoc DL(Op);
28549 SDValue N0 = Op.getOperand(0);
28550 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28551 DAG.getConstant(0, DL, VT), N0);
28552 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28553 SDValue(Neg.getNode(), 1)};
28554 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28555 }
28556
28557 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28558 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28559 SDLoc DL(Op);
28560 SDValue Src = Op.getOperand(0);
28561 SDValue Sub =
28562 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
28563 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
28564 }
28565
28566 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28567 assert(VT.isInteger() &&
28568 "Only handle AVX 256-bit vector integer operation");
28569 return splitVectorIntUnary(Op, DAG);
28570 }
28571
28572 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28573 return splitVectorIntUnary(Op, DAG);
28574
28575 // Default to expand.
28576 return SDValue();
28577}
28578
28579static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28580 SelectionDAG &DAG) {
28581 MVT VT = Op.getSimpleValueType();
28582
28583 // For AVX1 cases, split to use legal ops (everything but v4i64).
28584 if (VT.is256BitVector() && !Subtarget.hasInt256())
28585 return splitVectorIntBinary(Op, DAG);
28586
28587 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28588 return splitVectorIntBinary(Op, DAG);
28589
28590 // Default to expand.
28591 return SDValue();
28592}
28593
28594static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
28595 MVT VT = Op.getSimpleValueType();
28596
28597 // For AVX1 cases, split to use legal ops (everything but v4i64).
28598 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
28599 return splitVectorIntBinary(Op, DAG);
28600
28601 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28602 return splitVectorIntBinary(Op, DAG);
28603
28604 // Default to expand.
28605 return SDValue();
28606}
28607
28608static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28609 SelectionDAG &DAG) {
28610 SDLoc dl(Op);
28611 MVT VT = Op.getSimpleValueType();
28612
28613 // Decompose 256-bit ops into 128-bit ops.
28614 if (VT.is256BitVector() && !Subtarget.hasInt256())
28615 return splitVectorIntBinary(Op, DAG);
28616
28617 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28618 return splitVectorIntBinary(Op, DAG);
28619
28620 SDValue A = Op.getOperand(0);
28621 SDValue B = Op.getOperand(1);
28622
28623 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28624 // vector pairs, multiply and truncate.
28625 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28626 unsigned NumElts = VT.getVectorNumElements();
28627
28628 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28629 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28630 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28631 return DAG.getNode(
28632 ISD::TRUNCATE, dl, VT,
28633 DAG.getNode(ISD::MUL, dl, ExVT,
28634 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28635 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28636 }
28637
28638 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28639
28640 // Extract the lo/hi parts and any-extend them to i16.
28641 // We're going to mask off the low byte of each result element of the
28642 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28643 // element.
28644 SDValue Undef = DAG.getUNDEF(VT);
28645 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28646 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28647
28648 SDValue BLo, BHi;
28649 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28650 // If the RHS is a constant, manually unpackl/unpackh.
28651 SmallVector<SDValue, 16> LoOps, HiOps;
28652 for (unsigned i = 0; i != NumElts; i += 16) {
28653 for (unsigned j = 0; j != 8; ++j) {
28654 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28655 MVT::i16));
28656 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28657 MVT::i16));
28658 }
28659 }
28660
28661 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28662 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28663 } else {
28664 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28665 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28666 }
28667
28668 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
28669 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28670 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28671 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28672 }
28673
28674 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28675 if (VT == MVT::v4i32) {
28676 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28677 "Should not custom lower when pmulld is available!");
28678
28679 // Extract the odd parts.
28680 static const int UnpackMask[] = { 1, -1, 3, -1 };
28681 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28682 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28683
28684 // Multiply the even parts.
28685 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28686 DAG.getBitcast(MVT::v2i64, A),
28687 DAG.getBitcast(MVT::v2i64, B));
28688 // Now multiply odd parts.
28689 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28690 DAG.getBitcast(MVT::v2i64, Aodds),
28691 DAG.getBitcast(MVT::v2i64, Bodds));
28692
28693 Evens = DAG.getBitcast(VT, Evens);
28694 Odds = DAG.getBitcast(VT, Odds);
28695
28696 // Merge the two vectors back together with a shuffle. This expands into 2
28697 // shuffles.
28698 static const int ShufMask[] = { 0, 4, 2, 6 };
28699 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28700 }
28701
28702 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28703 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28704 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28705
28706 // Ahi = psrlqi(a, 32);
28707 // Bhi = psrlqi(b, 32);
28708 //
28709 // AloBlo = pmuludq(a, b);
28710 // AloBhi = pmuludq(a, Bhi);
28711 // AhiBlo = pmuludq(Ahi, b);
28712 //
28713 // Hi = psllqi(AloBhi + AhiBlo, 32);
28714 // return AloBlo + Hi;
28715 KnownBits AKnown = DAG.computeKnownBits(A);
28716 KnownBits BKnown = DAG.computeKnownBits(B);
28717
28718 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28719 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28720 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28721
28722 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28723 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28724 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28725
28726 SDValue Zero = DAG.getConstant(0, dl, VT);
28727
28728 // Only multiply lo/hi halves that aren't known to be zero.
28729 SDValue AloBlo = Zero;
28730 if (!ALoIsZero && !BLoIsZero)
28731 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28732
28733 SDValue AloBhi = Zero;
28734 if (!ALoIsZero && !BHiIsZero) {
28735 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28736 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28737 }
28738
28739 SDValue AhiBlo = Zero;
28740 if (!AHiIsZero && !BLoIsZero) {
28741 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28742 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28743 }
28744
28745 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28746 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28747
28748 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28749}
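// Illustrative aside (not part of the original file): the v2i64/v4i64/v8i64
// multiply decomposition above for one lane, in scalar C++. Three 32x32->64
// PMULUDQ-style products reconstruct the low 64 bits of the product; the
// Ahi*Bhi term only affects bits above 64 and is dropped.
static uint64_t mul64ViaPMULUDQSketch(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;           // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;           // pmuludq(a, Bhi)
  uint64_t AhiBlo = AHi * BLo;           // pmuludq(Ahi, b)
  uint64_t Hi = (AloBhi + AhiBlo) << 32; // psllqi(AloBhi + AhiBlo, 32)
  return AloBlo + Hi;
}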
28750
28751static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28752 MVT VT, bool IsSigned,
28753 const X86Subtarget &Subtarget,
28754 SelectionDAG &DAG,
28755 SDValue *Low = nullptr) {
28756 unsigned NumElts = VT.getVectorNumElements();
28757
28758 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28759 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28760 // lane results back together.
28761
28762 // We'll take different approaches for signed and unsigned.
28763 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28764 // and use pmullw to calculate the full 16-bit product.
28765 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28766 // shift them left into the upper byte of each word. This allows us to use
28767 // pmulhw to calculate the full 16-bit product. This trick means we don't
28768 // need to sign extend the bytes to use pmullw.
28769
28770 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28771 SDValue Zero = DAG.getConstant(0, dl, VT);
28772
28773 SDValue ALo, AHi;
28774 if (IsSigned) {
28775 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28776 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28777 } else {
28778 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28779 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28780 }
28781
28782 SDValue BLo, BHi;
28783 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28784 // If the RHS is a constant, manually unpackl/unpackh and extend.
28785 SmallVector<SDValue, 16> LoOps, HiOps;
28786 for (unsigned i = 0; i != NumElts; i += 16) {
28787 for (unsigned j = 0; j != 8; ++j) {
28788 SDValue LoOp = B.getOperand(i + j);
28789 SDValue HiOp = B.getOperand(i + j + 8);
28790
28791 if (IsSigned) {
28792 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28793 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28794 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28795 DAG.getConstant(8, dl, MVT::i16));
28796 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28797 DAG.getConstant(8, dl, MVT::i16));
28798 } else {
28799 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28800 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28801 }
28802
28803 LoOps.push_back(LoOp);
28804 HiOps.push_back(HiOp);
28805 }
28806 }
28807
28808 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28809 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28810 } else if (IsSigned) {
28811 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28812 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28813 } else {
28814 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28815 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28816 }
28817
28818 // Multiply, lshr the upper 8 bits into the lower 8 bits of the lo/hi results and
28819 // pack back to vXi8.
28820 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28821 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28822 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28823
28824 if (Low)
28825 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28826
28827 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28828}
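// Illustrative aside (not part of the original file): why the signed path
// above can use PMULHW without sign extension. Unpacking a byte into the
// *high* byte of a 16-bit lane multiplies it by 256, so the high 16 bits of
// the 32-bit signed product of two such lanes is exactly the full 16-bit
// product of the original bytes.
static int16_t mulI8FullViaPMULHWSketch(int8_t A, int8_t B) {
  int16_t A16 = (int16_t)((uint16_t)(uint8_t)A << 8); // punpck with zeros below
  int16_t B16 = (int16_t)((uint16_t)(uint8_t)B << 8);
  int32_t Prod = (int32_t)A16 * (int32_t)B16;         // pmulhw keeps bits 31:16
  return (int16_t)(Prod >> 16);                       // == (int16_t)A * (int16_t)B
}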
28829
28830static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28831 SelectionDAG &DAG) {
28832 SDLoc dl(Op);
28833 MVT VT = Op.getSimpleValueType();
28834 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28835 unsigned NumElts = VT.getVectorNumElements();
28836 SDValue A = Op.getOperand(0);
28837 SDValue B = Op.getOperand(1);
28838
28839 // Decompose 256-bit ops into 128-bit ops.
28840 if (VT.is256BitVector() && !Subtarget.hasInt256())
28841 return splitVectorIntBinary(Op, DAG);
28842
28843 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28844 return splitVectorIntBinary(Op, DAG);
28845
28846 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28847 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28848 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28849 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28850
28851 // PMULxD operations multiply each even value (starting at 0) of LHS with
28852 // the related value of RHS and produce a widen result.
28853 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28854 // => <2 x i64> <ae|cg>
28855 //
28856 // In other words, to have all the results, we need to perform two PMULxD:
28857 // 1. one with the even values.
28858 // 2. one with the odd values.
28859 // To achieve #2, we need to place the odd values at an even position.
28860 //
28861 // Place the odd value at an even position (basically, shift all values 1
28862 // step to the left):
28863 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28864 9, -1, 11, -1, 13, -1, 15, -1};
28865 // <a|b|c|d> => <b|undef|d|undef>
28866 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
28867 makeArrayRef(&Mask[0], NumElts));
28868 // <e|f|g|h> => <f|undef|h|undef>
28869 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
28870 makeArrayRef(&Mask[0], NumElts));
28871
28872 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28873 // ints.
28874 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28875 unsigned Opcode =
28876 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28877 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28878 // => <2 x i64> <ae|cg>
28879 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28880 DAG.getBitcast(MulVT, A),
28881 DAG.getBitcast(MulVT, B)));
28882 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28883 // => <2 x i64> <bf|dh>
28884 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28885 DAG.getBitcast(MulVT, Odd0),
28886 DAG.getBitcast(MulVT, Odd1)));
28887
28888 // Shuffle it back into the right order.
28889 SmallVector<int, 16> ShufMask(NumElts);
28890 for (int i = 0; i != (int)NumElts; ++i)
28891 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28892
28893 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28894
28895 // If we have a signed multiply but no PMULDQ fix up the result of an
28896 // unsigned multiply.
28897 if (IsSigned && !Subtarget.hasSSE41()) {
28898 SDValue Zero = DAG.getConstant(0, dl, VT);
28899 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28900 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28901 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28902 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28903
28904 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28905 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28906 }
28907
28908 return Res;
28909 }
28910
28911 // Only i8 vectors should need custom lowering after this.
28912 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28913 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28914 "Unsupported vector type");
28915
28916 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28917 // logical shift down the upper half and pack back to i8.
28918
28919 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28920 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28921
28922 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28923 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28924 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28925 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28926 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28927 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28928 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28929 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28930 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28931 }
28932
28933 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28934}
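// Illustrative aside (not part of the original file): the signed fixup above
// in scalar form. When only PMULUDQ is available, the signed high half is the
// unsigned high half minus B where A is negative and minus A where B is
// negative (all arithmetic modulo 2^32).
static uint32_t mulhu32Sketch(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32); // what PMULUDQ exposes per lane
}
static int32_t mulhs32ViaPMULUDQSketch(int32_t A, int32_t B) {
  uint32_t Res = mulhu32Sketch((uint32_t)A, (uint32_t)B);
  uint32_t T1 = A < 0 ? (uint32_t)B : 0; // AND(setgt(0, A), B)
  uint32_t T2 = B < 0 ? (uint32_t)A : 0; // AND(setgt(0, B), A)
  return (int32_t)(Res - (T1 + T2));     // SUB(Res, Fixup)
}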
28935
28936// Custom lowering for SMULO/UMULO.
28937static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28938 SelectionDAG &DAG) {
28939 MVT VT = Op.getSimpleValueType();
28940
28941 // Scalars defer to LowerXALUO.
28942 if (!VT.isVector())
28943 return LowerXALUO(Op, DAG);
28944
28945 SDLoc dl(Op);
28946 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28947 SDValue A = Op.getOperand(0);
28948 SDValue B = Op.getOperand(1);
28949 EVT OvfVT = Op->getValueType(1);
28950
28951 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28952 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28953 // Extract the LHS Lo/Hi vectors
28954 SDValue LHSLo, LHSHi;
28955 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28956
28957 // Extract the RHS Lo/Hi vectors
28958 SDValue RHSLo, RHSHi;
28959 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28960
28961 EVT LoOvfVT, HiOvfVT;
28962 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28963 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28964 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28965
28966 // Issue the split operations.
28967 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28968 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28969
28970 // Join the separate data results and the overflow results.
28971 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28972 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28973 Hi.getValue(1));
28974
28975 return DAG.getMergeValues({Res, Ovf}, dl);
28976 }
28977
28978 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28979 EVT SetccVT =
28980 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28981
28982 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28983 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28984 unsigned NumElts = VT.getVectorNumElements();
28985 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28986 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28987 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28988 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28989 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28990
28991 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28992
28993 SDValue Ovf;
28994 if (IsSigned) {
28995 SDValue High, LowSign;
28996 if (OvfVT.getVectorElementType() == MVT::i1 &&
28997 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28998 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28999 // Shift the high down filling with sign bits.
29000 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29001 // Fill all 16 bits with the sign bit from the low.
29002 LowSign =
29003 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29004 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29005 15, DAG);
29006 SetccVT = OvfVT;
29007 if (!Subtarget.hasBWI()) {
29008 // We can't do a vXi16 compare so sign extend to v16i32.
29009 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29010 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29011 }
29012 } else {
29013 // Otherwise do the compare at vXi8.
29014 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29015 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29016 LowSign =
29017 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29018 }
29019
29020 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29021 } else {
29022 SDValue High =
29023 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29024 if (OvfVT.getVectorElementType() == MVT::i1 &&
29025 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29026 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29027 SetccVT = OvfVT;
29028 if (!Subtarget.hasBWI()) {
29029 // We can't do a vXi16 compare so sign extend to v16i32.
29030 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
29031 }
29032 } else {
29033 // Otherwise do the compare at vXi8.
29034 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29035 }
29036
29037 Ovf =
29038 DAG.getSetCC(dl, SetccVT, High,
29039 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
29040 }
29041
29042 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29043
29044 return DAG.getMergeValues({Low, Ovf}, dl);
29045 }
29046
29047 SDValue Low;
29048 SDValue High =
29049 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
29050
29051 SDValue Ovf;
29052 if (IsSigned) {
29053 // SMULO overflows if the high bits don't match the sign of the low.
29054 SDValue LowSign =
29055 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29056 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29057 } else {
29058 // UMULO overflows if the high bits are non-zero.
29059 Ovf =
29060 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
29061 }
29062
29063 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29064
29065 return DAG.getMergeValues({Low, Ovf}, dl);
29066}
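// Illustrative aside (not part of the original file): the overflow tests the
// code above performs per element, written for one i8 lane. SMULO overflows
// when the high half differs from the sign-fill of the low half; UMULO
// overflows when the high half is non-zero.
static bool smulo8Sketch(int8_t A, int8_t B, int8_t &Low) {
  int16_t Full = (int16_t)A * (int16_t)B;
  Low = (int8_t)Full;
  int8_t High = (int8_t)(Full >> 8);
  return High != (int8_t)(Low >> 7);  // setcc(LowSign, High, SETNE)
}
static bool umulo8Sketch(uint8_t A, uint8_t B, uint8_t &Low) {
  uint16_t Full = (uint16_t)((unsigned)A * B);
  Low = (uint8_t)Full;
  return (uint8_t)(Full >> 8) != 0;   // setcc(High, 0, SETNE)
}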
29067
29068SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
29069 assert(Subtarget.isTargetWin64() && "Unexpected target");
29070 EVT VT = Op.getValueType();
29071 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29072 "Unexpected return type for lowering");
29073
29074 RTLIB::Libcall LC;
29075 bool isSigned;
29076 switch (Op->getOpcode()) {
29077 default: llvm_unreachable("Unexpected request for libcall!");
29078 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
29079 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
29080 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
29081 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
29082 }
29083
29084 SDLoc dl(Op);
29085 SDValue InChain = DAG.getEntryNode();
29086
29087 TargetLowering::ArgListTy Args;
29088 TargetLowering::ArgListEntry Entry;
29089 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29090 EVT ArgVT = Op->getOperand(i).getValueType();
29091 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29092 "Unexpected argument type for lowering");
29093 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29094 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29095 MachinePointerInfo MPI =
29096 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29097 Entry.Node = StackPtr;
29098 InChain =
29099 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29100 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
29101 Entry.Ty = PointerType::get(ArgTy,0);
29102 Entry.IsSExt = false;
29103 Entry.IsZExt = false;
29104 Args.push_back(Entry);
29105 }
29106
29107 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
29108 getPointerTy(DAG.getDataLayout()));
29109
29110 TargetLowering::CallLoweringInfo CLI(DAG);
29111 CLI.setDebugLoc(dl)
29112 .setChain(InChain)
29113 .setLibCallee(
29114 getLibcallCallingConv(LC),
29115 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
29116 std::move(Args))
29117 .setInRegister()
29118 .setSExtResult(isSigned)
29119 .setZExtResult(!isSigned);
29120
29121 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29122 return DAG.getBitcast(VT, CallInfo.first);
29123}
29124
29125SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29126 SelectionDAG &DAG,
29127 SDValue &Chain) const {
29128 assert(Subtarget.isTargetWin64() && "Unexpected target");
29129 EVT VT = Op.getValueType();
29130 bool IsStrict = Op->isStrictFPOpcode();
29131
29132 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29133 EVT ArgVT = Arg.getValueType();
29134
29135 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29136 "Unexpected return type for lowering");
29137
29138 RTLIB::Libcall LC;
29139 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29140 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29141 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29142 else
29143 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29144 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29145
29146 SDLoc dl(Op);
29147 MakeLibCallOptions CallOptions;
29148 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29149
29150 SDValue Result;
29151 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
29152 // expected VT (i128).
29153 std::tie(Result, Chain) =
29154 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29155 Result = DAG.getBitcast(VT, Result);
29156 return Result;
29157}
29158
29159SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29160 SelectionDAG &DAG) const {
29161 assert(Subtarget.isTargetWin64() && "Unexpected target");
29162 EVT VT = Op.getValueType();
29163 bool IsStrict = Op->isStrictFPOpcode();
29164
29165 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29166 EVT ArgVT = Arg.getValueType();
29167
29168 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29169 "Unexpected argument type for lowering");
29170
29171 RTLIB::Libcall LC;
29172 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29173 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29174 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29175 else
29176 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29177 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29178
29179 SDLoc dl(Op);
29180 MakeLibCallOptions CallOptions;
29181 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29182
29183 // Pass the i128 argument as an indirect argument on the stack.
29184 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29185 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29186 MachinePointerInfo MPI =
29187 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29188 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29189
29190 SDValue Result;
29191 std::tie(Result, Chain) =
29192 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29193 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29194}
29195
29196// Return true if the required (according to Opcode) shift-imm form is natively
29197// supported by the Subtarget
29198static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
29199 unsigned Opcode) {
29200 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29201 return false;
29202
29203 if (VT.getScalarSizeInBits() < 16)
29204 return false;
29205
29206 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29207 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29208 return true;
29209
29210 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29211 (VT.is256BitVector() && Subtarget.hasInt256());
29212
29213 bool AShift = LShift && (Subtarget.hasAVX512() ||
29214 (VT != MVT::v2i64 && VT != MVT::v4i64));
29215 return (Opcode == ISD::SRA) ? AShift : LShift;
29216}
29217
29218// The shift amount is a variable, but it is the same for all vector lanes.
29219// These instructions are defined together with shift-immediate.
29220static
29221bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
29222 unsigned Opcode) {
29223 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29224}
29225
29226// Return true if the required (according to Opcode) variable-shift form is
29227// natively supported by the Subtarget
29228static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
29229 unsigned Opcode) {
29230 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29231 return false;
29232
29233 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29234 return false;
29235
29236 // vXi16 supported only on AVX-512, BWI
29237 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29238 return false;
29239
29240 if (Subtarget.hasAVX512() &&
29241 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29242 return true;
29243
29244 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29245 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29246 return (Opcode == ISD::SRA) ? AShift : LShift;
29247}
29248
29249static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29250 const X86Subtarget &Subtarget) {
29251 MVT VT = Op.getSimpleValueType();
29252 SDLoc dl(Op);
29253 SDValue R = Op.getOperand(0);
29254 SDValue Amt = Op.getOperand(1);
29255 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29256
29257 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29258 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29259 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29260 SDValue Ex = DAG.getBitcast(ExVT, R);
29261
29262 // ashr(R, 63) === cmp_slt(R, 0)
29263 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29264 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29265 "Unsupported PCMPGT op");
29266 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29267 }
29268
29269 if (ShiftAmt >= 32) {
29270 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29271 SDValue Upper =
29272 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29273 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29274 ShiftAmt - 32, DAG);
29275 if (VT == MVT::v2i64)
29276 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29277 if (VT == MVT::v4i64)
29278 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29279 {9, 1, 11, 3, 13, 5, 15, 7});
29280 } else {
29281 // SRA upper i32, SRL whole i64 and select lower i32.
29282 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29283 ShiftAmt, DAG);
29284 SDValue Lower =
29285 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29286 Lower = DAG.getBitcast(ExVT, Lower);
29287 if (VT == MVT::v2i64)
29288 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29289 if (VT == MVT::v4i64)
29290 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29291 {8, 1, 10, 3, 12, 5, 14, 7});
29292 }
29293 return DAG.getBitcast(VT, Ex);
29294 };
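// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the ArithmeticShiftRight64 lambda above, assuming
// 0 <= amt <= 63: the 64-bit arithmetic shift is rebuilt from 32-bit SRA/SRL
// pieces, which is what the VSRAI/VSRLI + shuffle sequence does per i64 lane.
#include <cstdint>

static int64_t sra64_via_i32(int64_t x, unsigned amt) {
  int32_t hi = int32_t(uint64_t(x) >> 32);       // upper half, keeps the sign
  uint32_t newHi, newLo;
  if (amt >= 32) {
    newHi = uint32_t(hi >> 31);                  // splat sign into the upper half
    newLo = uint32_t(hi >> (amt - 32));          // SRA the old upper half
  } else {
    newHi = uint32_t(hi >> amt);                 // SRA upper i32
    newLo = uint32_t(uint64_t(x) >> amt);        // SRL whole i64, keep low half
  }
  return int64_t((uint64_t(newHi) << 32) | newLo);
}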
29295
29296 // Optimize shl/srl/sra with constant shift amount.
29297 APInt APIntShiftAmt;
29298 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29299 return SDValue();
29300
29301 // If the shift amount is out of range, return undef.
29302 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
29303 return DAG.getUNDEF(VT);
29304
29305 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29306
29307 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
29308 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29309
29310 // i64 SRA needs to be performed as partial shifts.
29311 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29312 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29313 Op.getOpcode() == ISD::SRA)
29314 return ArithmeticShiftRight64(ShiftAmt);
29315
29316 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29317 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29318 unsigned NumElts = VT.getVectorNumElements();
29319 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29320
29321 // Simple i8 add case
29322 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29323 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29324 // must be 0). (add undef, undef) however can be any value. To make this
29325 // safe, we must freeze R to ensure that register allocation uses the same
29326 // register for an undefined value. This ensures that the result will
29327 // still be even and preserves the original semantics.
29328 R = DAG.getNode(ISD::FREEZE, dl, VT, R);
29329 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29330 }
29331
29332 // ashr(R, 7) === cmp_slt(R, 0)
29333 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29334 SDValue Zeros = DAG.getConstant(0, dl, VT);
29335 if (VT.is512BitVector()) {
29336 assert(VT == MVT::v64i8 && "Unexpected element type!");
29337 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29338 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29339 }
29340 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29341 }
29342
29343 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29344 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29345 return SDValue();
29346
29347 if (Op.getOpcode() == ISD::SHL) {
29348 // Make a large shift.
29349 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29350 ShiftAmt, DAG);
29351 SHL = DAG.getBitcast(VT, SHL);
29352 // Zero out the rightmost bits.
29353 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29354 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29355 }
29356 if (Op.getOpcode() == ISD::SRL) {
29357 // Make a large shift.
29358 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29359 ShiftAmt, DAG);
29360 SRL = DAG.getBitcast(VT, SRL);
29361 // Zero out the leftmost bits.
29362 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29363 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29364 }
29365 if (Op.getOpcode() == ISD::SRA) {
29366 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
29367 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29368
29369 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29370 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29371 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29372 return Res;
29373 }
29374 llvm_unreachable("Unknown shift opcode.");
29375 }
29376
29377 return SDValue();
29378}
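// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar i8 models of the constant vXi8 shifts lowered above: the bytes are
// shifted as wider i16 lanes, the bits that leak in from the neighbouring
// byte are masked off, and for SRA the sign is restored with the identity
// ashr(x, s) == sub(xor(lshr(x, s), 128 >> s), 128 >> s).
// Example: x = 0xF0 (-16), s = 2: lshr -> 0x3C, mask = 0x20, xor -> 0x1C,
// sub -> 0xFC == -4 == ashr(-16, 2).
#include <cstdint>

static uint8_t shl8(uint8_t x, unsigned s) {      // s in [1, 7]
  return uint8_t(x << s) & uint8_t(0xFFu << s);   // APInt::getHighBitsSet(8, 8 - s)
}
static uint8_t srl8(uint8_t x, unsigned s) {
  return uint8_t(x >> s) & uint8_t(0xFFu >> s);   // APInt::getLowBitsSet(8, 8 - s)
}
static int8_t sra8(uint8_t x, unsigned s) {
  uint8_t mask = uint8_t(128u >> s);              // the shifted sign bit
  return int8_t(uint8_t((uint8_t(x >> s) ^ mask) - mask));
}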
29379
29380static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29381 const X86Subtarget &Subtarget) {
29382 MVT VT = Op.getSimpleValueType();
29383 SDLoc dl(Op);
29384 SDValue R = Op.getOperand(0);
29385 SDValue Amt = Op.getOperand(1);
29386 unsigned Opcode = Op.getOpcode();
29387 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29388
29389 int BaseShAmtIdx = -1;
29390 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29391 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29392 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29393 Subtarget, DAG);
29394
29395 // vXi8 shifts - shift as v8i16 + mask result.
29396 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29397 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29398 VT == MVT::v64i8) &&
29399 !Subtarget.hasXOP()) {
29400 unsigned NumElts = VT.getVectorNumElements();
29401 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29402 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29403 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29404 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29405
29406 // Create the mask using vXi16 shifts. For shift-rights we need to move
29407 // the upper byte down before splatting the vXi8 mask.
29408 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29409 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29410 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29411 if (Opcode != ISD::SHL)
29412 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29413 8, DAG);
29414 BitMask = DAG.getBitcast(VT, BitMask);
29415 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29416 SmallVector<int, 64>(NumElts, 0));
29417
29418 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29419 DAG.getBitcast(ExtVT, R), BaseShAmt,
29420 BaseShAmtIdx, Subtarget, DAG);
29421 Res = DAG.getBitcast(VT, Res);
29422 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29423
29424 if (Opcode == ISD::SRA) {
29425 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29426 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29427 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29428 SignMask =
29429 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29430 BaseShAmtIdx, Subtarget, DAG);
29431 SignMask = DAG.getBitcast(VT, SignMask);
29432 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29433 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29434 }
29435 return Res;
29436 }
29437 }
29438 }
29439
29440 return SDValue();
29441}
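// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// How the splatted byte mask above is derived for a uniform shift amount s:
// shift an all-ones i16 with the same logical shift, for right shifts move
// the surviving upper byte down, then broadcast byte 0 across the vector.
// That byte is exactly the per-byte mask needed after shifting R as vXi16.
#include <cstdint>

static uint8_t byteMaskForShift(bool isShiftLeft, unsigned s) {  // s in [0, 7]
  uint16_t bits = 0xFFFF;                          // DAG.getConstant(-1, dl, ExtVT)
  bits = isShiftLeft ? uint16_t(bits << s) : uint16_t(bits >> s);
  if (!isShiftLeft)
    bits = uint16_t(bits >> 8);                    // move the upper byte down
  return uint8_t(bits);                            // byte 0 is then splatted
}
// byteMaskForShift(true, 3)  == 0xF8   (mask for a vXi8 SHL by 3)
// byteMaskForShift(false, 3) == 0x1F   (mask for a vXi8 SRL/SRA by 3)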
29442
29443// Convert a shift/rotate left amount to a multiplication scale factor.
29444static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29445 const X86Subtarget &Subtarget,
29446 SelectionDAG &DAG) {
29447 MVT VT = Amt.getSimpleValueType();
29448 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29449 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29450 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29451 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29452 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29453 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29454 return SDValue();
29455
29456 MVT SVT = VT.getVectorElementType();
29457 unsigned SVTBits = SVT.getSizeInBits();
29458 unsigned NumElems = VT.getVectorNumElements();
29459
29460 APInt UndefElts;
29461 SmallVector<APInt> EltBits;
29462 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29463 APInt One(SVTBits, 1);
29464 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29465 for (unsigned I = 0; I != NumElems; ++I) {
29466 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29467 continue;
29468 uint64_t ShAmt = EltBits[I].getZExtValue();
29469 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29470 }
29471 return DAG.getBuildVector(VT, dl, Elts);
29472 }
29473
29474 // If the target doesn't support variable shifts, use either FP conversion
29475 // or integer multiplication to avoid shifting each element individually.
29476 if (VT == MVT::v4i32) {
29477 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29478 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29479 DAG.getConstant(0x3f800000U, dl, VT));
29480 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29481 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29482 }
29483
29484 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29485 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29486 SDValue Z = DAG.getConstant(0, dl, VT);
29487 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29488 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29489 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29490 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29491 if (Subtarget.hasSSE41())
29492 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29493 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29494 }
29495
29496 return SDValue();
29497}
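// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// The v4i32 path above materialises 2^Amt without a variable shift: putting
// Amt into the float exponent field (Amt << 23, biased by adding
// 0x3f800000 == 1.0f) gives the float 2.0^Amt, and FP_TO_SINT turns that back
// into the integer scale. Worked example for Amt == 5:
// (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f, and (int)32.0f == 32 == 1 << 5.
#include <cstdint>
#include <cstring>

static int32_t shiftScaleViaFloat(uint32_t amt) {   // amt in [0, 30]
  uint32_t bits = (amt << 23) + 0x3f800000u;        // exponent trick
  float f;
  std::memcpy(&f, &bits, sizeof(f));                // the bitcast to v4f32
  return int32_t(f);                                // FP_TO_SINT
}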
29498
29499static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29500 SelectionDAG &DAG) {
29501 MVT VT = Op.getSimpleValueType();
29502 SDLoc dl(Op);
29503 SDValue R = Op.getOperand(0);
29504 SDValue Amt = Op.getOperand(1);
29505 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29506 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29507
29508 unsigned Opc = Op.getOpcode();
29509 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29510 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29511
29512 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29513 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29514
29515 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29516 return V;
29517
29518 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29519 return V;
29520
29521 if (supportedVectorVarShift(VT, Subtarget, Opc))
29522 return Op;
29523
29524 // i64 vector arithmetic shift can be emulated with the transform:
29525 // M = lshr(SIGN_MASK, Amt)
29526 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29527 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29528 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29529 Opc == ISD::SRA) {
29530 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29531 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29532 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29533 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29534 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29535 return R;
29536 }
29537
29538 // XOP has 128-bit variable logical/arithmetic shifts.
29539 // +ve/-ve Amt = shift left/right.
29540 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29541 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29542 if (Opc == ISD::SRL || Opc == ISD::SRA) {
29543 SDValue Zero = DAG.getConstant(0, dl, VT);
29544 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29545 }
29546 if (Opc == ISD::SHL || Opc == ISD::SRL)
29547 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29548 if (Opc == ISD::SRA)
29549 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29550 }
29551
29552 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29553 // shifts per-lane and then shuffle the partial results back together.
29554 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29555 // Splat the shift amounts so the scalar shifts above will catch it.
29556 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29557 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29558 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29559 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29560 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29561 }
29562
29563 // If possible, lower this shift as a sequence of two shifts by
29564 // constant plus a BLENDing shuffle instead of scalarizing it.
29565 // Example:
29566 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29567 //
29568 // Could be rewritten as:
29569 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29570 //
29571 // The advantage is that the two shifts from the example would be
29572 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29573 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29574 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29575 SDValue Amt1, Amt2;
29576 unsigned NumElts = VT.getVectorNumElements();
29577 SmallVector<int, 8> ShuffleMask;
29578 for (unsigned i = 0; i != NumElts; ++i) {
29579 SDValue A = Amt->getOperand(i);
29580 if (A.isUndef()) {
29581 ShuffleMask.push_back(SM_SentinelUndef);
29582 continue;
29583 }
29584 if (!Amt1 || Amt1 == A) {
29585 ShuffleMask.push_back(i);
29586 Amt1 = A;
29587 continue;
29588 }
29589 if (!Amt2 || Amt2 == A) {
29590 ShuffleMask.push_back(i + NumElts);
29591 Amt2 = A;
29592 continue;
29593 }
29594 break;
29595 }
29596
29597 // Only perform this blend if we can perform it without loading a mask.
29598 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29599 (VT != MVT::v16i16 ||
29600 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29601 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29602 canWidenShuffleElements(ShuffleMask))) {
29603 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29604 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29605 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29606 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29607 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29608 Cst1->getZExtValue(), DAG);
29609 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29610 Cst2->getZExtValue(), DAG);
29611 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29612 }
29613 }
29614 }
29615
29616 // If possible, lower this packed shift into a vector multiply instead of
29617 // expanding it into a sequence of scalar shifts.
29618 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29619 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29620 Subtarget.canExtendTo512BW())))
29621 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29622 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29623
29624 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29625 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29626 if (Opc == ISD::SRL && ConstantAmt &&
29627 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29628 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29629 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29630 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29631 SDValue Zero = DAG.getConstant(0, dl, VT);
29632 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29633 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29634 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29635 }
29636 }
29637
29638 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29639 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29640 // TODO: Special case handling for shift by 0/1, really we can afford either
29641 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29642 if (Opc == ISD::SRA && ConstantAmt &&
29643 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29644 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29645 !Subtarget.hasAVX512()) ||
29646 DAG.isKnownNeverZero(Amt))) {
29647 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29648 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29649 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29650 SDValue Amt0 =
29651 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29652 SDValue Amt1 =
29653 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29654 SDValue Sra1 =
29655 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29656 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29657 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29658 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29659 }
29660 }
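// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar i16 model of the two blocks above: a constant right shift by s with
// 0 < s < 16 is the high half of a multiply by 2^(16 - s), i.e.
// srl(x, s) == mulhu(x, 1 << (16 - s)) and sra(x, s) == mulhs(x, 1 << (16 - s)),
// which is why the amount is rewritten as (NumEltBits - Amt) and fed through
// convertShiftLeftToScale. Shift-by-0 (and shift-by-1 in the signed case,
// where 1 << 15 does not fit a signed i16 scale) is handled by the selects.
#include <cstdint>

static uint16_t srl16_via_mulhu(uint16_t x, unsigned s) {   // s in [1, 15]
  uint32_t scale = 1u << (16 - s);
  return uint16_t((uint32_t(x) * scale) >> 16);             // MULHU
}
static int16_t sra16_via_mulhs(int16_t x, unsigned s) {     // s in [2, 15]
  int32_t scale = 1 << (16 - s);
  return int16_t((int32_t(x) * scale) >> 16);               // MULHS
}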
29661
29662 // v4i32 Non Uniform Shifts.
29663 // If the shift amount is constant we can shift each lane using the SSE2
29664 // immediate shifts, else we need to zero-extend each lane to the lower i64
29665 // and shift using the SSE2 variable shifts.
29666 // The separate results can then be blended together.
29667 if (VT == MVT::v4i32) {
29668 SDValue Amt0, Amt1, Amt2, Amt3;
29669 if (ConstantAmt) {
29670 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29671 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29672 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29673 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29674 } else {
29675 // The SSE2 shifts use the lower i64 as the same shift amount for
29676 // all lanes and the upper i64 is ignored. On AVX we're better off
29677 // just zero-extending, but for SSE just duplicating the top 16-bits is
29678 // cheaper and has the same effect for out of range values.
29679 if (Subtarget.hasAVX()) {
29680 SDValue Z = DAG.getConstant(0, dl, VT);
29681 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29682 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29683 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29684 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29685 } else {
29686 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29687 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29688 {4, 5, 6, 7, -1, -1, -1, -1});
29689 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29690 {0, 1, 1, 1, -1, -1, -1, -1});
29691 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29692 {2, 3, 3, 3, -1, -1, -1, -1});
29693 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29694 {0, 1, 1, 1, -1, -1, -1, -1});
29695 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29696 {2, 3, 3, 3, -1, -1, -1, -1});
29697 }
29698 }
29699
29700 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29701 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29702 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29703 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29704 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29705
29706 // Merge the shifted lane results optimally with/without PBLENDW.
29707 // TODO - ideally shuffle combining would handle this.
29708 if (Subtarget.hasSSE41()) {
29709 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29710 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29711 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29712 }
29713 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29714 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29715 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29716 }
29717
29718 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29719 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29720 // make the existing SSE solution better.
29721 // NOTE: We honor preferred vector width before promoting to 512-bits.
29722 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29723 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29724 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29725 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29726 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29727 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29728 "Unexpected vector type");
29729 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29730 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29731 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29732 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29733 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29734 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29735 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29736 }
29737
29738 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29739 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29740 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29741 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29742 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29743 !Subtarget.hasXOP()) {
29744 int NumElts = VT.getVectorNumElements();
29745 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29746
29747 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29748 // isn't legal).
29749 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29750 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29751 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29752 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29753 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29754 "Constant build vector expected");
29755
29756 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29757 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
29758 : DAG.getZExtOrTrunc(R, dl, ExVT);
29759 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29760 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29761 return DAG.getZExtOrTrunc(R, dl, VT);
29762 }
29763
29764 SmallVector<SDValue, 16> LoAmt, HiAmt;
29765 for (int i = 0; i != NumElts; i += 16) {
29766 for (int j = 0; j != 8; ++j) {
29767 LoAmt.push_back(Amt.getOperand(i + j));
29768 HiAmt.push_back(Amt.getOperand(i + j + 8));
29769 }
29770 }
29771
29772 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29773 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29774 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29775
29776 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29777 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29778 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29779 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29780 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29781 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29782 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29783 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29784 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29785 }
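// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the constant vXi8 block above: each byte is widened to i16
// (zext for SRL, sext for SRA), multiplied by 2^(8 - s), and the high byte of
// that product is the shifted byte - effectively a per-byte MUL_LOHI.
#include <cstdint>

static uint8_t srl8_via_mul(uint8_t x, unsigned s) {   // s in [1, 7]
  uint16_t scale = uint16_t(1u << (8 - s));
  return uint8_t((uint16_t(x) * scale) >> 8);
}
static int8_t sra8_via_mul(int8_t x, unsigned s) {     // s in [1, 7]
  uint16_t scale = uint16_t(1u << (8 - s));
  uint16_t prod = uint16_t(uint16_t(int16_t(x)) * scale);  // sext, i16 multiply
  return int8_t(uint8_t(prod >> 8));                       // VSRLI by 8, truncate
}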
29786
29787 if (VT == MVT::v16i8 ||
29788 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29789 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29790 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29791
29792 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29793 if (VT.is512BitVector()) {
29794 // On AVX512BW targets we make use of the fact that VSELECT lowers
29795 // to a masked blend which selects bytes based just on the sign bit
29796 // extracted to a mask.
29797 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29798 V0 = DAG.getBitcast(VT, V0);
29799 V1 = DAG.getBitcast(VT, V1);
29800 Sel = DAG.getBitcast(VT, Sel);
29801 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29802 ISD::SETGT);
29803 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29804 } else if (Subtarget.hasSSE41()) {
29805 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29806 // on the sign bit.
29807 V0 = DAG.getBitcast(VT, V0);
29808 V1 = DAG.getBitcast(VT, V1);
29809 Sel = DAG.getBitcast(VT, Sel);
29810 return DAG.getBitcast(SelVT,
29811 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29812 }
29813 // On pre-SSE41 targets we test for the sign bit by comparing to
29814 // zero - a negative value will set all bits of the lanes to true
29815 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29816 SDValue Z = DAG.getConstant(0, dl, SelVT);
29817 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29818 return DAG.getSelect(dl, SelVT, C, V0, V1);
29819 };
29820
29821 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29822 // We can safely do this using i16 shifts as we're only interested in
29823 // the 3 lower bits of each byte.
29824 Amt = DAG.getBitcast(ExtVT, Amt);
29825 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29826 Amt = DAG.getBitcast(VT, Amt);
29827
29828 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29829 // r = VSELECT(r, shift(r, 4), a);
29830 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29831 R = SignBitSelect(VT, Amt, M, R);
29832
29833 // a += a
29834 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29835
29836 // r = VSELECT(r, shift(r, 2), a);
29837 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29838 R = SignBitSelect(VT, Amt, M, R);
29839
29840 // a += a
29841 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29842
29843 // return VSELECT(r, shift(r, 1), a);
29844 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29845 R = SignBitSelect(VT, Amt, M, R);
29846 return R;
29847 }
29848
29849 if (Opc == ISD::SRA) {
29850 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29851 // so we can correctly sign extend. We don't care what happens to the
29852 // lower byte.
29853 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29854 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29855 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29856 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29857 ALo = DAG.getBitcast(ExtVT, ALo);
29858 AHi = DAG.getBitcast(ExtVT, AHi);
29859 RLo = DAG.getBitcast(ExtVT, RLo);
29860 RHi = DAG.getBitcast(ExtVT, RHi);
29861
29862 // r = VSELECT(r, shift(r, 4), a);
29863 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29864 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29865 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29866 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29867
29868 // a += a
29869 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29870 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29871
29872 // r = VSELECT(r, shift(r, 2), a);
29873 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29874 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29875 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29876 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29877
29878 // a += a
29879 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29880 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29881
29882 // r = VSELECT(r, shift(r, 1), a);
29883 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29884 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29885 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29886 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29887
29888 // Logical shift the result back to the lower byte, leaving a zero upper
29889 // byte meaning that we can safely pack with PACKUSWB.
29890 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29891 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29892 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29893 }
29894 }
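// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the vXi8 variable-shift ladder above: after Amt <<= 5,
// bit 2 of the original amount sits in the byte's sign bit, so a sign-based
// select decides whether to shift by 4; doubling Amt then moves bit 1 (and
// afterwards bit 0) into the sign position for the shift-by-2 and shift-by-1
// steps. Shown for SHL; SRL/SRA follow the same pattern.
#include <cstdint>

static uint8_t shl8_ladder(uint8_t r, uint8_t amt) {   // amt in [0, 7]
  uint8_t a = uint8_t(amt << 5);
  if (a & 0x80) r = uint8_t(r << 4);   // the VSELECT on the sign bit
  a = uint8_t(a + a);
  if (a & 0x80) r = uint8_t(r << 2);
  a = uint8_t(a + a);
  if (a & 0x80) r = uint8_t(r << 1);
  return r;
}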
29895
29896 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29897 MVT ExtVT = MVT::v8i32;
29898 SDValue Z = DAG.getConstant(0, dl, VT);
29899 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29900 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29901 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29902 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29903 ALo = DAG.getBitcast(ExtVT, ALo);
29904 AHi = DAG.getBitcast(ExtVT, AHi);
29905 RLo = DAG.getBitcast(ExtVT, RLo);
29906 RHi = DAG.getBitcast(ExtVT, RHi);
29907 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29908 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29909 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29910 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29911 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29912 }
29913
29914 if (VT == MVT::v8i16) {
29915 // If we have a constant shift amount, the non-SSE41 path is best as
29916 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29917 bool UseSSE41 = Subtarget.hasSSE41() &&
29918 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29919
29920 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29921 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29922 // the sign bit.
29923 if (UseSSE41) {
29924 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29925 V0 = DAG.getBitcast(ExtVT, V0);
29926 V1 = DAG.getBitcast(ExtVT, V1);
29927 Sel = DAG.getBitcast(ExtVT, Sel);
29928 return DAG.getBitcast(
29929 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29930 }
29931 // On pre-SSE41 targets we splat the sign bit - a negative value will
29932 // set all bits of the lanes to true and VSELECT uses that in
29933 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29934 SDValue C =
29935 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29936 return DAG.getSelect(dl, VT, C, V0, V1);
29937 };
29938
29939 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29940 if (UseSSE41) {
29941 // On SSE41 targets we need to replicate the shift mask in both
29942 // bytes for PBLENDVB.
29943 Amt = DAG.getNode(
29944 ISD::OR, dl, VT,
29945 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29946 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29947 } else {
29948 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29949 }
29950
29951 // r = VSELECT(r, shift(r, 8), a);
29952 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29953 R = SignBitSelect(Amt, M, R);
29954
29955 // a += a
29956 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29957
29958 // r = VSELECT(r, shift(r, 4), a);
29959 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29960 R = SignBitSelect(Amt, M, R);
29961
29962 // a += a
29963 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29964
29965 // r = VSELECT(r, shift(r, 2), a);
29966 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29967 R = SignBitSelect(Amt, M, R);
29968
29969 // a += a
29970 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29971
29972 // return VSELECT(r, shift(r, 1), a);
29973 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29974 R = SignBitSelect(Amt, M, R);
29975 return R;
29976 }
29977
29978 // Decompose 256-bit shifts into 128-bit shifts.
29979 if (VT.is256BitVector())
29980 return splitVectorIntBinary(Op, DAG);
29981
29982 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29983 return splitVectorIntBinary(Op, DAG);
29984
29985 return SDValue();
29986}
29987
29988static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29989 SelectionDAG &DAG) {
29990 MVT VT = Op.getSimpleValueType();
29991 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29992 "Unexpected funnel shift opcode!");
29993
29994 SDLoc DL(Op);
29995 SDValue Op0 = Op.getOperand(0);
29996 SDValue Op1 = Op.getOperand(1);
29997 SDValue Amt = Op.getOperand(2);
29998 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29999 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30000
30001 if (VT.isVector()) {
30002 APInt APIntShiftAmt;
30003 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30004
30005 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
30006 if (IsFSHR)
30007 std::swap(Op0, Op1);
30008
30009 if (IsCstSplat) {
30010 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30011 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
30012 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
30013 {Op0, Op1, Imm}, DAG, Subtarget);
30014 }
30015 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
30016 {Op0, Op1, Amt}, DAG, Subtarget);
30017 }
30018 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30019 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
30020 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
30021 "Unexpected funnel shift type!");
30022
30023 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
30024 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
30025 if (IsCstSplat)
30026 return SDValue();
30027
30028 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30029 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30030 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30031
30032 // Constant vXi16 funnel shifts can be efficiently handled by default.
30033 if (IsCst && EltSizeInBits == 16)
30034 return SDValue();
30035
30036 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30037 unsigned NumElts = VT.getVectorNumElements();
30038 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30039 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30040
30041 // Split 256-bit integers on XOP/pre-AVX2 targets.
30042 // Split 512-bit integers on non 512-bit BWI targets.
30043 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30044 !Subtarget.hasAVX2())) ||
30045 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30046 EltSizeInBits < 32)) {
30047 // Pre-mask the amount modulo using the wider vector.
30048 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30049 return splitVectorOp(Op, DAG);
30050 }
30051
30052 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30053 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30054 int ScalarAmtIdx = -1;
30055 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30056 // Uniform vXi16 funnel shifts can be efficiently handled by default.
30057 if (EltSizeInBits == 16)
30058 return SDValue();
30059
30060 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30061 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30062 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30063 ScalarAmtIdx, Subtarget, DAG);
30064 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30065 ScalarAmtIdx, Subtarget, DAG);
30066 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30067 }
30068 }
30069
30070 MVT WideSVT = MVT::getIntegerVT(
30071 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30072 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30073
30074 // If per-element shifts are legal, fallback to generic expansion.
30075 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30076 return SDValue();
30077
30078 // Attempt to fold as:
30079 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30080 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30081 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30082 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30083 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30084 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30085 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30086 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30087 EltSizeInBits, DAG);
30088 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30089 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30090 if (!IsFSHR)
30091 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30092 EltSizeInBits, DAG);
30093 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30094 }
30095
30096 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30097 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30098 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30099 SDValue Z = DAG.getConstant(0, DL, VT);
30100 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30101 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30102 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30103 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30104 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30105 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30106 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30107 }
30108
30109 // Fallback to generic expansion.
30110 return SDValue();
30111 }
30112 assert(
30113 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30114 "Unexpected funnel shift type!");
30115
30116 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30117 bool OptForSize = DAG.shouldOptForSize();
30118 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30119
30120 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30121 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30122 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30123 !isa<ConstantSDNode>(Amt)) {
30124 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30125 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30126 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30127 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30128 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30129 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30130 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30131 if (IsFSHR) {
30132 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30133 } else {
30134 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30135 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30136 }
30137 return DAG.getZExtOrTrunc(Res, DL, VT);
30138 }
30139
30140 if (VT == MVT::i8 || ExpandFunnel)
30141 return SDValue();
30142
30143 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30144 if (VT == MVT::i16) {
30145 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30146 DAG.getConstant(15, DL, Amt.getValueType()));
30147 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30148 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30149 }
30150
30151 return Op;
30152}
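// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar i8 model of the widening folds used above:
//   fshl(x, y, z) -> (((x << 8) | y) << (z & 7)) >> 8
//   fshr(x, y, z) ->  (((x << 8) | y) >> (z & 7)) & 0xFF
// e.g. fshl(0x12, 0x34, 4): 0x1234 << 4 == 0x2340 in 16 bits, >> 8 == 0x23.
#include <cstdint>

static uint8_t fshl8(uint8_t x, uint8_t y, unsigned z) {
  uint16_t wide = uint16_t((uint16_t(x) << 8) | y);   // concat(x, y)
  return uint8_t(uint16_t(wide << (z & 7)) >> 8);     // shift left, keep high byte
}
static uint8_t fshr8(uint8_t x, uint8_t y, unsigned z) {
  uint16_t wide = uint16_t((uint16_t(x) << 8) | y);
  return uint8_t(wide >> (z & 7));                    // shift right, keep low byte
}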
30153
30154static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30155 SelectionDAG &DAG) {
30156 MVT VT = Op.getSimpleValueType();
30157 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30158
30159 SDLoc DL(Op);
30160 SDValue R = Op.getOperand(0);
30161 SDValue Amt = Op.getOperand(1);
30162 unsigned Opcode = Op.getOpcode();
30163 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30164 int NumElts = VT.getVectorNumElements();
30165 bool IsROTL = Opcode == ISD::ROTL;
30166
30167 // Check for constant splat rotation amount.
30168 APInt CstSplatValue;
30169 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30170
30171 // Check for splat rotate by zero.
30172 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30173 return R;
30174
30175 // AVX512 implicitly uses modulo rotation amounts.
30176 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
30177 // Attempt to rotate by immediate.
30178 if (IsCstSplat) {
30179 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30180 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30181 return DAG.getNode(RotOpc, DL, VT, R,
30182 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30183 }
30184
30185 // Else, fall-back on VPROLV/VPRORV.
30186 return Op;
30187 }
30188
30189 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30190 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30191 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30192 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30193 }
30194
30195 SDValue Z = DAG.getConstant(0, DL, VT);
30196
30197 if (!IsROTL) {
 30198 // If the ISD::ROTR amount is constant, we're always better off converting
 30199 // to ISD::ROTL.
30200 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30201 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30202
 30203 // XOP targets always prefer ISD::ROTL.
30204 if (Subtarget.hasXOP())
30205 return DAG.getNode(ISD::ROTL, DL, VT, R,
30206 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30207 }
30208
30209 // Split 256-bit integers on XOP/pre-AVX2 targets.
30210 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30211 return splitVectorIntBinary(Op, DAG);
30212
30213 // XOP has 128-bit vector variable + immediate rotates.
30214 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30215 // XOP implicitly uses modulo rotation amounts.
30216 if (Subtarget.hasXOP()) {
 30217 assert(IsROTL && "Only ROTL expected");
 30218 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30219
30220 // Attempt to rotate by immediate.
30221 if (IsCstSplat) {
30222 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30223 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30224 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30225 }
30226
30227 // Use general rotate by variable (per-element).
30228 return Op;
30229 }
30230
 30231 // Rotate by a uniform constant - expand back to shifts.
30232 if (IsCstSplat)
30233 return SDValue();
30234
30235 // Split 512-bit integers on non 512-bit BWI targets.
30236 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30237 return splitVectorIntBinary(Op, DAG);
30238
 30239 assert(
 30240 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
 30241 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
 30242 Subtarget.hasAVX2()) ||
 30243 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
 30244 "Only vXi32/vXi16/vXi8 vector rotates supported");
30245
30246 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30247 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30248
30249 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30250 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30251
30252 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30253 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30254 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
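 // Unpacking x with itself yields 2*bw-bit lanes holding (x << bw) | x, so a
 // single wide shift leaves rotl(x,y) in the upper half (or rotr(x,y) in the
 // lower half) of each lane, which getPack then extracts.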
30255 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30256 int BaseRotAmtIdx = -1;
30257 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30258 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30259 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30260 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30261 }
30262 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30263 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30264 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30265 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30266 BaseRotAmtIdx, Subtarget, DAG);
30267 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30268 BaseRotAmtIdx, Subtarget, DAG);
30269 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30270 }
30271 }
30272
30273 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30274 // the amount bit.
30275 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30276 if (EltSizeInBits == 8) {
30277 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30278 MVT WideVT =
30279 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30280 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30281
30282 // Attempt to fold as:
30283 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30284 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30285 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30286 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30287 // If we're rotating by constant, just use default promotion.
30288 if (IsConstAmt)
30289 return SDValue();
30290 // See if we can perform this by widening to vXi16 or vXi32.
30291 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30292 R = DAG.getNode(
30293 ISD::OR, DL, WideVT, R,
30294 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30295 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30296 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30297 if (IsROTL)
30298 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30299 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30300 }
30301
30302 // Attempt to fold as unpack(x,x) << zext(y):
30303 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30304 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30305 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30306 // See if we can perform this by unpacking to lo/hi vXi16.
30307 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30308 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30309 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30310 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30311 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30312 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30313 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30314 }
 30315 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
30316
30317 // We don't need ModuloAmt here as we just peek at individual bits.
30318 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30319 if (Subtarget.hasSSE41()) {
30320 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30321 // on the sign bit.
30322 V0 = DAG.getBitcast(VT, V0);
30323 V1 = DAG.getBitcast(VT, V1);
30324 Sel = DAG.getBitcast(VT, Sel);
30325 return DAG.getBitcast(SelVT,
30326 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30327 }
30328 // On pre-SSE41 targets we test for the sign bit by comparing to
30329 // zero - a negative value will set all bits of the lanes to true
30330 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30331 SDValue Z = DAG.getConstant(0, DL, SelVT);
30332 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30333 return DAG.getSelect(DL, SelVT, C, V0, V1);
30334 };
30335
30336 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30337 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30338 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30339 IsROTL = true;
30340 }
30341
30342 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30343 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30344
30345 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30346 // We can safely do this using i16 shifts as we're only interested in
30347 // the 3 lower bits of each byte.
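 // Shifting left by 5 moves bit 2 of each amount into the byte's sign bit; the
 // "a += a" steps below then expose bit 1 and bit 0 for the rot2/rot1 selects.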
30348 Amt = DAG.getBitcast(ExtVT, Amt);
30349 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30350 Amt = DAG.getBitcast(VT, Amt);
30351
30352 // r = VSELECT(r, rot(r, 4), a);
30353 SDValue M;
30354 M = DAG.getNode(
30355 ISD::OR, DL, VT,
30356 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30357 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30358 R = SignBitSelect(VT, Amt, M, R);
30359
30360 // a += a
30361 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30362
30363 // r = VSELECT(r, rot(r, 2), a);
30364 M = DAG.getNode(
30365 ISD::OR, DL, VT,
30366 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30367 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30368 R = SignBitSelect(VT, Amt, M, R);
30369
30370 // a += a
30371 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30372
30373 // return VSELECT(r, rot(r, 1), a);
30374 M = DAG.getNode(
30375 ISD::OR, DL, VT,
30376 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30377 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30378 return SignBitSelect(VT, Amt, M, R);
30379 }
30380
30381 bool IsSplatAmt = DAG.isSplatValue(Amt);
30382 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30383 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30384 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30385
30386 // Fallback for splats + all supported variable shifts.
30387 // Fallback for non-constants AVX2 vXi16 as well.
30388 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30389 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30390 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30391 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30392 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30393 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30394 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30395 }
30396
30397 // Everything below assumes ISD::ROTL.
30398 if (!IsROTL) {
30399 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30400 IsROTL = true;
30401 }
30402
30403 // ISD::ROT* uses modulo rotate amounts.
30404 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30405
 30406 assert(IsROTL && "Only ROTL supported");
30407
30408 // As with shifts, attempt to convert the rotation amount to a multiplication
30409 // factor, fallback to general expansion.
30410 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30411 if (!Scale)
30412 return SDValue();
30413
30414 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30415 if (EltSizeInBits == 16) {
30416 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30417 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30418 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30419 }
30420
30421 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30422 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30423 // that can then be OR'd with the lower 32-bits.
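 // For example, rotl(x,7) becomes the 64-bit product x * (1 << 7): the low
 // 32 bits hold x << 7 and the high 32 bits hold x >> 25, which OR together
 // into the rotated value.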
 30424 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30425 static const int OddMask[] = {1, -1, 3, -1};
30426 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30427 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30428
30429 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30430 DAG.getBitcast(MVT::v2i64, R),
30431 DAG.getBitcast(MVT::v2i64, Scale));
30432 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30433 DAG.getBitcast(MVT::v2i64, R13),
30434 DAG.getBitcast(MVT::v2i64, Scale13));
30435 Res02 = DAG.getBitcast(VT, Res02);
30436 Res13 = DAG.getBitcast(VT, Res13);
30437
30438 return DAG.getNode(ISD::OR, DL, VT,
30439 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30440 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30441}
30442
30443/// Returns true if the operand type is exactly twice the native width, and
30444/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30445/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30446/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30447bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30448 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30449
30450 if (OpWidth == 64)
30451 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30452 if (OpWidth == 128)
30453 return Subtarget.canUseCMPXCHG16B();
30454
30455 return false;
30456}
30457
30458TargetLoweringBase::AtomicExpansionKind
30459X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30460 Type *MemType = SI->getValueOperand()->getType();
30461
30462 bool NoImplicitFloatOps =
30463 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30464 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30465 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30466 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30467 return AtomicExpansionKind::None;
30468
30469 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30470 : AtomicExpansionKind::None;
30471}
30472
30473// Note: this turns large loads into lock cmpxchg8b/16b.
30474// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30475TargetLowering::AtomicExpansionKind
30476X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30477 Type *MemType = LI->getType();
30478
 30479 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30480 // can use movq to do the load. If we have X87 we can load into an 80-bit
30481 // X87 register and store it to a stack temporary.
30482 bool NoImplicitFloatOps =
30483 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30484 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30485 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30486 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30487 return AtomicExpansionKind::None;
30488
30489 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30490 : AtomicExpansionKind::None;
30491}
30492
30493TargetLowering::AtomicExpansionKind
30494X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30495 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30496 // prefix to a normal instruction for these operations.
30497 if (AI->use_empty())
30498 return AtomicExpansionKind::None;
30499
 30500 // If the atomicrmw's result is used by a single-bit AND, we may use a
 30501 // bts/btr/btc instruction for these operations.
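 // For example, an atomicrmw 'or' with 32 whose only use is an AND with 32 can
 // become a single LOCK BTS of bit 5 (see emitBitTestAtomicRMWIntrinsic below).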
30502 auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
30503 Instruction *I = AI->user_back();
30504 if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
30505 AI->getParent() != I->getParent())
30506 return AtomicExpansionKind::CmpXChg;
 30507 // The following instruction must be an AND with a single bit.
30508 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
30509 unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
30510 if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
30511 return AtomicExpansionKind::CmpXChg;
30512
30513 if (AI->getOperation() == AtomicRMWInst::And)
30514 return ~C1->getValue() == C2->getValue()
30515 ? AtomicExpansionKind::BitTestIntrinsic
30516 : AtomicExpansionKind::CmpXChg;
30517
30518 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30519 : AtomicExpansionKind::CmpXChg;
30520}
30521
30522void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30523 IRBuilder<> Builder(AI);
30524 Intrinsic::ID IID = Intrinsic::not_intrinsic;
30525 switch (AI->getOperation()) {
30526 default:
 30527 llvm_unreachable("Unknown atomic operation");
30528 case AtomicRMWInst::Or:
30529 IID = Intrinsic::x86_atomic_bts;
30530 break;
30531 case AtomicRMWInst::Xor:
30532 IID = Intrinsic::x86_atomic_btc;
30533 break;
30534 case AtomicRMWInst::And:
30535 IID = Intrinsic::x86_atomic_btr;
30536 break;
30537 }
30538 Instruction *I = AI->user_back();
30539 LLVMContext &Ctx = AI->getContext();
30540 unsigned Imm =
30541 countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
30542 Function *BitTest =
30543 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30544 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30545 Type::getInt8PtrTy(Ctx));
30546 Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30547 I->replaceAllUsesWith(Result);
30548 I->eraseFromParent();
30549 AI->eraseFromParent();
30550}
30551
30552TargetLowering::AtomicExpansionKind
30553X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30554 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30555 Type *MemType = AI->getType();
30556
30557 // If the operand is too big, we must see if cmpxchg8/16b is available
30558 // and default to library calls otherwise.
30559 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30560 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30561 : AtomicExpansionKind::None;
30562 }
30563
30564 AtomicRMWInst::BinOp Op = AI->getOperation();
30565 switch (Op) {
30566 default:
 30567 llvm_unreachable("Unknown atomic operation");
30568 case AtomicRMWInst::Xchg:
30569 case AtomicRMWInst::Add:
30570 case AtomicRMWInst::Sub:
30571 // It's better to use xadd, xsub or xchg for these in all cases.
30572 return AtomicExpansionKind::None;
30573 case AtomicRMWInst::Or:
30574 case AtomicRMWInst::And:
30575 case AtomicRMWInst::Xor:
30576 return shouldExpandLogicAtomicRMWInIR(AI);
30577 case AtomicRMWInst::Nand:
30578 case AtomicRMWInst::Max:
30579 case AtomicRMWInst::Min:
30580 case AtomicRMWInst::UMax:
30581 case AtomicRMWInst::UMin:
30582 case AtomicRMWInst::FAdd:
30583 case AtomicRMWInst::FSub:
30584 // These always require a non-trivial set of data operations on x86. We must
30585 // use a cmpxchg loop.
30586 return AtomicExpansionKind::CmpXChg;
30587 }
30588}
30589
30590LoadInst *
30591X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30592 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30593 Type *MemType = AI->getType();
30594 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30595 // there is no benefit in turning such RMWs into loads, and it is actually
30596 // harmful as it introduces a mfence.
30597 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30598 return nullptr;
30599
30600 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30601 // lowering available in lowerAtomicArith.
30602 // TODO: push more cases through this path.
30603 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30604 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30605 AI->use_empty())
30606 return nullptr;
30607
30608 IRBuilder<> Builder(AI);
30609 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30610 auto SSID = AI->getSyncScopeID();
30611 // We must restrict the ordering to avoid generating loads with Release or
30612 // ReleaseAcquire orderings.
30613 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30614
30615 // Before the load we need a fence. Here is an example lifted from
30616 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30617 // is required:
30618 // Thread 0:
30619 // x.store(1, relaxed);
30620 // r1 = y.fetch_add(0, release);
30621 // Thread 1:
30622 // y.fetch_add(42, acquire);
30623 // r2 = x.load(relaxed);
30624 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30625 // lowered to just a load without a fence. A mfence flushes the store buffer,
30626 // making the optimization clearly correct.
30627 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
30628 // otherwise, we might be able to be more aggressive on relaxed idempotent
30629 // rmw. In practice, they do not look useful, so we don't try to be
30630 // especially clever.
30631 if (SSID == SyncScope::SingleThread)
30632 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
30633 // the IR level, so we must wrap it in an intrinsic.
30634 return nullptr;
30635
30636 if (!Subtarget.hasMFence())
30637 // FIXME: it might make sense to use a locked operation here but on a
30638 // different cache-line to prevent cache-line bouncing. In practice it
30639 // is probably a small win, and x86 processors without mfence are rare
30640 // enough that we do not bother.
30641 return nullptr;
30642
30643 Function *MFence =
30644 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30645 Builder.CreateCall(MFence, {});
30646
30647 // Finally we can emit the atomic load.
30648 LoadInst *Loaded = Builder.CreateAlignedLoad(
30649 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30650 Loaded->setAtomic(Order, SSID);
30651 AI->replaceAllUsesWith(Loaded);
30652 AI->eraseFromParent();
30653 return Loaded;
30654}
30655
30656bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
30657 if (!SI.isUnordered())
30658 return false;
30659 return ExperimentalUnorderedISEL;
30660}
30661bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
30662 if (!LI.isUnordered())
30663 return false;
30664 return ExperimentalUnorderedISEL;
30665}
30666
30667
30668/// Emit a locked operation on a stack location which does not change any
30669/// memory location, but does involve a lock prefix. Location is chosen to be
30670/// a) very likely accessed only by a single thread to minimize cache traffic,
30671/// and b) definitely dereferenceable. Returns the new Chain result.
30672static SDValue emitLockedStackOp(SelectionDAG &DAG,
30673 const X86Subtarget &Subtarget, SDValue Chain,
30674 const SDLoc &DL) {
30675 // Implementation notes:
30676 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30677 // operations issued by the current processor. As such, the location
30678 // referenced is not relevant for the ordering properties of the instruction.
 30679 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30680 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30681 // 2) Using an immediate operand appears to be the best encoding choice
30682 // here since it doesn't require an extra register.
30683 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30684 // is small enough it might just be measurement noise.)
30685 // 4) When choosing offsets, there are several contributing factors:
30686 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30687 // line aligned stack object to improve this case.)
30688 // b) To minimize our chances of introducing a false dependence, we prefer
30689 // to offset the stack usage from TOS slightly.
30690 // c) To minimize concerns about cross thread stack usage - in particular,
30691 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30692 // captures state in the TOS frame and accesses it from many threads -
30693 // we want to use an offset such that the offset is in a distinct cache
30694 // line from the TOS frame.
30695 //
30696 // For a general discussion of the tradeoffs and benchmark results, see:
30697 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
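 // The node built below amounts to "lock orl $0, <disp>(%esp/%rsp)": a full
 // barrier that leaves memory unchanged and needs no scratch register.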
30698
30699 auto &MF = DAG.getMachineFunction();
30700 auto &TFL = *Subtarget.getFrameLowering();
30701 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30702
30703 if (Subtarget.is64Bit()) {
30704 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30705 SDValue Ops[] = {
30706 DAG.getRegister(X86::RSP, MVT::i64), // Base
30707 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30708 DAG.getRegister(0, MVT::i64), // Index
30709 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30710 DAG.getRegister(0, MVT::i16), // Segment.
30711 Zero,
30712 Chain};
30713 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30714 MVT::Other, Ops);
30715 return SDValue(Res, 1);
30716 }
30717
30718 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30719 SDValue Ops[] = {
30720 DAG.getRegister(X86::ESP, MVT::i32), // Base
30721 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30722 DAG.getRegister(0, MVT::i32), // Index
30723 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30724 DAG.getRegister(0, MVT::i16), // Segment.
30725 Zero,
30726 Chain
30727 };
30728 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30729 MVT::Other, Ops);
30730 return SDValue(Res, 1);
30731}
30732
30733static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30734 SelectionDAG &DAG) {
30735 SDLoc dl(Op);
30736 AtomicOrdering FenceOrdering =
30737 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30738 SyncScope::ID FenceSSID =
30739 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30740
30741 // The only fence that needs an instruction is a sequentially-consistent
30742 // cross-thread fence.
30743 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30744 FenceSSID == SyncScope::System) {
30745 if (Subtarget.hasMFence())
30746 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30747
30748 SDValue Chain = Op.getOperand(0);
30749 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30750 }
30751
30752 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30753 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30754}
30755
30756static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30757 SelectionDAG &DAG) {
30758 MVT T = Op.getSimpleValueType();
30759 SDLoc DL(Op);
30760 unsigned Reg = 0;
30761 unsigned size = 0;
30762 switch(T.SimpleTy) {
 30763 default: llvm_unreachable("Invalid value type!");
30764 case MVT::i8: Reg = X86::AL; size = 1; break;
30765 case MVT::i16: Reg = X86::AX; size = 2; break;
30766 case MVT::i32: Reg = X86::EAX; size = 4; break;
30767 case MVT::i64:
 30768 assert(Subtarget.is64Bit() && "Node not type legal!");
30769 Reg = X86::RAX; size = 8;
30770 break;
30771 }
30772 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30773 Op.getOperand(2), SDValue());
30774 SDValue Ops[] = { cpIn.getValue(0),
30775 Op.getOperand(1),
30776 Op.getOperand(3),
30777 DAG.getTargetConstant(size, DL, MVT::i8),
30778 cpIn.getValue(1) };
30779 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30780 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30781 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30782 Ops, T, MMO);
30783
30784 SDValue cpOut =
30785 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30786 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30787 MVT::i32, cpOut.getValue(2));
30788 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30789
30790 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30791 cpOut, Success, EFLAGS.getValue(1));
30792}
30793
30794// Create MOVMSKB, taking into account whether we need to split for AVX1.
30795static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30796 const X86Subtarget &Subtarget) {
30797 MVT InVT = V.getSimpleValueType();
30798
30799 if (InVT == MVT::v64i8) {
30800 SDValue Lo, Hi;
30801 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30802 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30803 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30804 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30805 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30806 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30807 DAG.getConstant(32, DL, MVT::i8));
30808 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30809 }
30810 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30811 SDValue Lo, Hi;
30812 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30813 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30814 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30815 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30816 DAG.getConstant(16, DL, MVT::i8));
30817 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30818 }
30819
30820 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30821}
30822
30823static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
30824 SelectionDAG &DAG) {
30825 SDValue Src = Op.getOperand(0);
30826 MVT SrcVT = Src.getSimpleValueType();
30827 MVT DstVT = Op.getSimpleValueType();
30828
30829 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
30830 // half to v32i1 and concatenating the result.
30831 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
 30832 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
 30833 assert(Subtarget.hasBWI() && "Expected BWI target");
30834 SDLoc dl(Op);
30835 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30836 DAG.getIntPtrConstant(0, dl));
30837 Lo = DAG.getBitcast(MVT::v32i1, Lo);
30838 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30839 DAG.getIntPtrConstant(1, dl));
30840 Hi = DAG.getBitcast(MVT::v32i1, Hi);
30841 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
30842 }
30843
30844 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
30845 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
 30846 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
30847 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
30848 SDLoc DL(Op);
30849 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
30850 V = getPMOVMSKB(DL, V, DAG, Subtarget);
30851 return DAG.getZExtOrTrunc(V, DL, DstVT);
30852 }
30853
 30854 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
 30855 SrcVT == MVT::i64) && "Unexpected VT!");
30856
 30857 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30858 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30859 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30860 // This conversion needs to be expanded.
30861 return SDValue();
30862
30863 SDLoc dl(Op);
30864 if (SrcVT.isVector()) {
 30865 // Widen the input vector in the case of MVT::v2i32.
30866 // Example: from MVT::v2i32 to MVT::v4i32.
30867 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30868 SrcVT.getVectorNumElements() * 2);
30869 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30870 DAG.getUNDEF(SrcVT));
30871 } else {
 30872 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
 30873 "Unexpected source type in LowerBITCAST");
30874 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30875 }
30876
30877 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30878 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30879
30880 if (DstVT == MVT::x86mmx)
30881 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30882
30883 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30884 DAG.getIntPtrConstant(0, dl));
30885}
30886
30887/// Compute the horizontal sum of bytes in V for the elements of VT.
30888///
30889/// Requires V to be a byte vector and VT to be an integer vector type with
30890/// wider elements than V's type. The width of the elements of VT determines
30891/// how many bytes of V are summed horizontally to produce each element of the
30892/// result.
30893static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30894 const X86Subtarget &Subtarget,
30895 SelectionDAG &DAG) {
30896 SDLoc DL(V);
30897 MVT ByteVecVT = V.getSimpleValueType();
30898 MVT EltVT = VT.getVectorElementType();
 30899 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
 30900 "Expected value to have byte element type.");
 30901 assert(EltVT != MVT::i8 &&
 30902 "Horizontal byte sum only makes sense for wider elements!");
30903 unsigned VecSize = VT.getSizeInBits();
 30904 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30905
 30906 // The PSADBW instruction horizontally adds all bytes and leaves the result in
 30907 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
30908 if (EltVT == MVT::i64) {
30909 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30910 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30911 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30912 return DAG.getBitcast(VT, V);
30913 }
30914
30915 if (EltVT == MVT::i32) {
30916 // We unpack the low half and high half into i32s interleaved with zeros so
30917 // that we can use PSADBW to horizontally sum them. The most useful part of
30918 // this is that it lines up the results of two PSADBW instructions to be
30919 // two v2i64 vectors which concatenated are the 4 population counts. We can
30920 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
30921 SDValue Zeros = DAG.getConstant(0, DL, VT);
30922 SDValue V32 = DAG.getBitcast(VT, V);
30923 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30924 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30925
30926 // Do the horizontal sums into two v2i64s.
30927 Zeros = DAG.getConstant(0, DL, ByteVecVT);
30928 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30929 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30930 DAG.getBitcast(ByteVecVT, Low), Zeros);
30931 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30932 DAG.getBitcast(ByteVecVT, High), Zeros);
30933
30934 // Merge them together.
30935 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30936 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30937 DAG.getBitcast(ShortVecVT, Low),
30938 DAG.getBitcast(ShortVecVT, High));
30939
30940 return DAG.getBitcast(VT, V);
30941 }
30942
30943 // The only element type left is i16.
30944 assert(EltVT == MVT::i16 && "Unknown how to handle type")(static_cast <bool> (EltVT == MVT::i16 && "Unknown how to handle type"
) ? void (0) : __assert_fail ("EltVT == MVT::i16 && \"Unknown how to handle type\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 30944, __extension__
__PRETTY_FUNCTION__))
;
30945
30946 // To obtain pop count for each i16 element starting from the pop count for
30947 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
30948 // right by 8. It is important to shift as i16s as i8 vector shift isn't
30949 // directly supported.
30950 SDValue ShifterV = DAG.getConstant(8, DL, VT);
30951 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30952 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30953 DAG.getBitcast(ByteVecVT, V));
30954 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30955}
30956
30957static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30958 const X86Subtarget &Subtarget,
30959 SelectionDAG &DAG) {
30960 MVT VT = Op.getSimpleValueType();
30961 MVT EltVT = VT.getVectorElementType();
30962 int NumElts = VT.getVectorNumElements();
30963 (void)EltVT;
 30964 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30965
30966 // Implement a lookup table in register by using an algorithm based on:
30967 // http://wm.ite.pl/articles/sse-popcount.html
30968 //
 30969 // The general idea is that the lower nibble of every byte in the input
 30970 // vector is an index into an in-register pre-computed pop count table. We
 30971 // then split the input vector into two new ones: (1) a vector with only the
 30972 // shifted-right higher nibbles for each byte and (2) a vector with the lower
 30973 // nibbles (and masked-out higher ones) for each byte. PSHUFB is used
 30974 // separately with both to index the in-register table. Next, both are added
 30975 // and the result is an i8 vector where each element holds its byte's pop count.
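 // For example, the byte 0xE5 (0b1110'0101) splits into high nibble 0xE and
 // low nibble 0x5; LUT[0xE] = 3 and LUT[0x5] = 2, which sum to the expected
 // pop count of 5.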
30976 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30977 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30978 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30979 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30980
30981 SmallVector<SDValue, 64> LUTVec;
30982 for (int i = 0; i < NumElts; ++i)
30983 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30984 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30985 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30986
30987 // High nibbles
30988 SDValue FourV = DAG.getConstant(4, DL, VT);
30989 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30990
30991 // Low nibbles
30992 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30993
30994 // The input vector is used as the shuffle mask that index elements into the
30995 // LUT. After counting low and high nibbles, add the vector to obtain the
30996 // final pop count per i8 element.
30997 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30998 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30999 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31000}
31001
31002// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31003// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31004static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
31005 SelectionDAG &DAG) {
31006 MVT VT = Op.getSimpleValueType();
 31007 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
 31008 "Unknown CTPOP type to handle");
31009 SDLoc DL(Op.getNode());
31010 SDValue Op0 = Op.getOperand(0);
31011
31012 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31013 if (Subtarget.hasVPOPCNTDQ()) {
31014 unsigned NumElems = VT.getVectorNumElements();
 31015 assert((VT.getVectorElementType() == MVT::i8 ||
 31016 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31017 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31018 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31019 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31020 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31021 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31022 }
31023 }
31024
31025 // Decompose 256-bit ops into smaller 128-bit ops.
31026 if (VT.is256BitVector() && !Subtarget.hasInt256())
31027 return splitVectorIntUnary(Op, DAG);
31028
31029 // Decompose 512-bit ops into smaller 256-bit ops.
31030 if (VT.is512BitVector() && !Subtarget.hasBWI())
31031 return splitVectorIntUnary(Op, DAG);
31032
31033 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31034 if (VT.getScalarType() != MVT::i8) {
31035 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31036 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31037 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31038 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31039 }
31040
31041 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31042 if (!Subtarget.hasSSSE3())
31043 return SDValue();
31044
31045 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31046}
31047
31048static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
31049 SelectionDAG &DAG) {
 31050 assert(Op.getSimpleValueType().isVector() &&
 31051 "We only do custom lowering for vector population count.");
31052 return LowerVectorCTPOP(Op, Subtarget, DAG);
31053}
31054
31055static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31056 MVT VT = Op.getSimpleValueType();
31057 SDValue In = Op.getOperand(0);
31058 SDLoc DL(Op);
31059
 31060 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31061 // perform the BITREVERSE.
31062 if (!VT.isVector()) {
31063 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31064 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31065 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31066 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31067 DAG.getIntPtrConstant(0, DL));
31068 }
31069
31070 int NumElts = VT.getVectorNumElements();
31071 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31072
31073 // Decompose 256-bit ops into smaller 128-bit ops.
31074 if (VT.is256BitVector())
31075 return splitVectorIntUnary(Op, DAG);
31076
 31077 assert(VT.is128BitVector() &&
 31078 "Only 128-bit vector bitreverse lowering supported.");
31079
31080 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31081 // perform the BSWAP in the shuffle.
 31082 // It's best to shuffle using the second operand as this will implicitly allow
31083 // memory folding for multiple vectors.
31084 SmallVector<SDValue, 16> MaskElts;
31085 for (int i = 0; i != NumElts; ++i) {
31086 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31087 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31088 int PermuteByte = SourceByte | (2 << 5);
31089 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31090 }
31091 }
31092
31093 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31094 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31095 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31096 Res, Mask);
31097 return DAG.getBitcast(VT, Res);
31098}
31099
31100static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31101 SelectionDAG &DAG) {
31102 MVT VT = Op.getSimpleValueType();
31103
31104 if (Subtarget.hasXOP() && !VT.is512BitVector())
31105 return LowerBITREVERSE_XOP(Op, DAG);
31106
 31107 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31108
31109 SDValue In = Op.getOperand(0);
31110 SDLoc DL(Op);
31111
 31112 assert(VT.getScalarType() == MVT::i8 &&
 31113 "Only byte vector BITREVERSE supported");
31114
31115 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
31116 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
31117 return splitVectorIntUnary(Op, DAG);
31118
31119 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31120 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
31121 return splitVectorIntUnary(Op, DAG);
31122
31123 unsigned NumElts = VT.getVectorNumElements();
31124
31125 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31126 if (Subtarget.hasGFNI()) {
31127 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31128 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31129 Matrix = DAG.getBitcast(VT, Matrix);
31130 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31131 DAG.getTargetConstant(0, DL, MVT::i8));
31132 }
31133
31134 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31135 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31136 // 0-15 value (moved to the other nibble).
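 // For example, for the byte 0x1F: LoLUT[0xF] = 0xF0 reverses the low nibble
 // into the high position and HiLUT[0x1] = 0x08 reverses the high nibble into
 // the low position, so the OR yields the bit-reverse 0xF8.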
31137 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31138 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31139 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31140
31141 const int LoLUT[16] = {
31142 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31143 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31144 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31145 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31146 const int HiLUT[16] = {
31147 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31148 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31149 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31150 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31151
31152 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31153 for (unsigned i = 0; i < NumElts; ++i) {
31154 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31155 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31156 }
31157
31158 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31159 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31160 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31161 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31162 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31163}
31164
31165static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31166 SelectionDAG &DAG) {
31167 SDLoc DL(Op);
31168 SDValue X = Op.getOperand(0);
31169 MVT VT = Op.getSimpleValueType();
31170
31171 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31172 if (VT == MVT::i8 ||
31173 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31174 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31175 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31176 DAG.getConstant(0, DL, MVT::i8));
31177 // Copy the inverse of the parity flag into a register with setcc.
31178 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31179 // Extend to the original type.
31180 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31181 }
31182
31183 // If we have POPCNT, use the default expansion.
31184 if (Subtarget.hasPOPCNT())
31185 return SDValue();
31186
31187 if (VT == MVT::i64) {
31188 // Xor the high and low 32-bit halves together using a 32-bit operation.
31189 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31190 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31191 DAG.getConstant(32, DL, MVT::i8)));
31192 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31193 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31194 }
31195
31196 if (VT != MVT::i16) {
31197 // Xor the high and low 16-bits together using a 32-bit operation.
31198 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31199 DAG.getConstant(16, DL, MVT::i8));
31200 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31201 } else {
31202 // If the input is 16-bits, we need to extend to use an i32 shift below.
31203 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31204 }
31205
31206 // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
31207 // This should allow an h-reg to be used to save a shift.
31208 SDValue Hi = DAG.getNode(
31209 ISD::TRUNCATE, DL, MVT::i8,
31210 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31211 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31212 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31213 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31214
31215 // Copy the inverse of the parity flag into a register with setcc.
31216 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31217 // Extend to the original type.
31218 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31219}
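// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the PARITY
// lowering above folds the value in half repeatedly with XOR until a single
// byte remains, then reads the x86 parity flag of that byte (SETNP yields the
// XOR of its bits). __builtin_popcountll assumes a GCC/Clang-style compiler.
#include <cassert>
#include <cstdint>

static unsigned parityByFolding(uint64_t X) {
  uint32_t V = uint32_t(X) ^ uint32_t(X >> 32); // fold 64 bits down to 32
  V ^= V >> 16;                                 // fold 32 bits down to 16
  uint8_t B = uint8_t(V) ^ uint8_t(V >> 8);     // fold 16 bits down to 8
  return unsigned(__builtin_popcount(B) & 1);   // parity of the final byte
}

int main() {
  const uint64_t Tests[] = {0, 1, 3, 0x8000000000000001ULL, 0x123456789ABCDEF0ULL};
  for (uint64_t X : Tests)
    assert(parityByFolding(X) == unsigned(__builtin_popcountll(X) & 1));
  return 0;
}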
31220
31221static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31222 const X86Subtarget &Subtarget) {
31223 unsigned NewOpc = 0;
31224 switch (N->getOpcode()) {
31225 case ISD::ATOMIC_LOAD_ADD:
31226 NewOpc = X86ISD::LADD;
31227 break;
31228 case ISD::ATOMIC_LOAD_SUB:
31229 NewOpc = X86ISD::LSUB;
31230 break;
31231 case ISD::ATOMIC_LOAD_OR:
31232 NewOpc = X86ISD::LOR;
31233 break;
31234 case ISD::ATOMIC_LOAD_XOR:
31235 NewOpc = X86ISD::LXOR;
31236 break;
31237 case ISD::ATOMIC_LOAD_AND:
31238 NewOpc = X86ISD::LAND;
31239 break;
31240 default:
31241 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31242 }
31243
31244 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31245
31246 return DAG.getMemIntrinsicNode(
31247 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31248 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31249 /*MemVT=*/N->getSimpleValueType(0), MMO);
31250}
31251
31252/// Lower atomic_load_ops into LOCK-prefixed operations.
31253static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31254 const X86Subtarget &Subtarget) {
31255 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31256 SDValue Chain = N->getOperand(0);
31257 SDValue LHS = N->getOperand(1);
31258 SDValue RHS = N->getOperand(2);
31259 unsigned Opc = N->getOpcode();
31260 MVT VT = N->getSimpleValueType(0);
31261 SDLoc DL(N);
31262
31263 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31264 // can only be lowered when the result is unused. They should have already
31265 // been transformed into a cmpxchg loop in AtomicExpand.
31266 if (N->hasAnyUseOfValue(0)) {
31267 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31268 // select LXADD if LOCK_SUB can't be selected.
31269 if (Opc == ISD::ATOMIC_LOAD_SUB) {
31270 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
31271 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31272 RHS, AN->getMemOperand());
31273 }
31274 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31275        "Used AtomicRMW ops other than Add should have been expanded!");
31276 return N;
31277 }
31278
31279 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31280 // The core idea here is that since the memory location isn't actually
31281 // changing, all we need is a lowering for the *ordering* impacts of the
31282 // atomicrmw. As such, we can choose a different operation and memory
31283 // location to minimize impact on other code.
31284 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
31285 // On X86, the only ordering which actually requires an instruction is
31286 // seq_cst that isn't SingleThread; everything else just needs to be
31287 // preserved during codegen and then dropped. Note that we expect (but
31288 // don't assume) that orderings other than seq_cst and acq_rel have been
31289 // canonicalized to a store or load.
31290 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31291 AN->getSyncScopeID() == SyncScope::System) {
31292 // Prefer a locked operation against a stack location to minimize cache
31293 // traffic. This assumes that stack locations are very likely to be
31294 // accessed only by the owning thread.
31295 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31296 assert(!N->hasAnyUseOfValue(0));
31297 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31298 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31299 DAG.getUNDEF(VT), NewChain);
31300 }
31301 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31302 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
31303 assert(!N->hasAnyUseOfValue(0));
31304 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31305 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31306 DAG.getUNDEF(VT), NewChain);
31307 }
31308
31309 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31310 // RAUW the chain, but don't worry about the result, as it's unused.
31311 assert(!N->hasAnyUseOfValue(0));
31312 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31313 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31314 DAG.getUNDEF(VT), LockOp.getValue(1));
31315}
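// Illustrative scalar sketch (not part of X86ISelLowering.cpp): two identities
// the lowering above relies on. First, (atomic_load_sub p, v) equals
// (atomic_load_add p, -v) under two's-complement wraparound, which lets the
// backend fall back to LXADD. Second, an idempotent RMW such as fetch_or(0)
// never changes memory, so only its ordering effect has to be preserved.
#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  std::atomic<uint32_t> A{100}, B{100};
  const uint32_t V = 7;
  uint32_t OldA = A.fetch_sub(V);
  uint32_t OldB = B.fetch_add(0u - V); // add the two's-complement negation
  assert(OldA == OldB && A.load() == B.load());

  std::atomic<uint32_t> C{42};
  (void)C.fetch_or(0, std::memory_order_seq_cst); // ordering only, value unchanged
  assert(C.load() == 42);
  return 0;
}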
31316
31317static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31318 const X86Subtarget &Subtarget) {
31319 auto *Node = cast<AtomicSDNode>(Op.getNode());
31320 SDLoc dl(Node);
31321 EVT VT = Node->getMemoryVT();
31322
31323 bool IsSeqCst =
31324 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31325 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31326
31327 // If this store is not sequentially consistent and the type is legal
31328 // we can just keep it.
31329 if (!IsSeqCst && IsTypeLegal)
31330 return Op;
31331
31332 if (VT == MVT::i64 && !IsTypeLegal) {
31333 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31334 // is enabled.
31335 bool NoImplicitFloatOps =
31336 DAG.getMachineFunction().getFunction().hasFnAttribute(
31337 Attribute::NoImplicitFloat);
31338 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31339 SDValue Chain;
31340 if (Subtarget.hasSSE1()) {
31341 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
31342 Node->getOperand(2));
31343 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31344 SclToVec = DAG.getBitcast(StVT, SclToVec);
31345 SDVTList Tys = DAG.getVTList(MVT::Other);
31346 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31347 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31348 MVT::i64, Node->getMemOperand());
31349 } else if (Subtarget.hasX87()) {
31350 // First load this into an 80-bit X87 register using a stack temporary.
31351 // This will put the whole integer into the significand.
31352 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31353 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31354 MachinePointerInfo MPI =
31355 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31356 Chain =
31357 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
31358 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31359 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31360 SDValue LdOps[] = {Chain, StackPtr};
31361 SDValue Value =
31362 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31363 /*Align*/ None, MachineMemOperand::MOLoad);
31364 Chain = Value.getValue(1);
31365
31366 // Now use an FIST to do the atomic store.
31367 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31368 Chain =
31369 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31370 StoreOps, MVT::i64, Node->getMemOperand());
31371 }
31372
31373 if (Chain) {
31374 // If this is a sequentially consistent store, also emit an appropriate
31375 // barrier.
31376 if (IsSeqCst)
31377 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31378
31379 return Chain;
31380 }
31381 }
31382 }
31383
31384 // Convert seq_cst store -> xchg
31385 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31386 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31387 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
31388 Node->getMemoryVT(),
31389 Node->getOperand(0),
31390 Node->getOperand(1), Node->getOperand(2),
31391 Node->getMemOperand());
31392 return Swap.getValue(1);
31393}
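// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the fallback
// above turns a seq_cst (or type-illegal) atomic store into an atomic swap
// whose result is ignored; at the C++ level that corresponds to an exchange
// used purely for its store side effect.
#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  std::atomic<uint64_t> Location{0};
  (void)Location.exchange(42, std::memory_order_seq_cst); // store via swap
  assert(Location.load() == 42);
  return 0;
}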
31394
31395static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
31396 SDNode *N = Op.getNode();
31397 MVT VT = N->getSimpleValueType(0);
31398 unsigned Opc = Op.getOpcode();
31399
31400 // Let legalize expand this if it isn't a legal type yet.
31401 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31402 return SDValue();
31403
31404 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31405 SDLoc DL(N);
31406
31407 // Set the carry flag.
31408 SDValue Carry = Op.getOperand(2);
31409 EVT CarryVT = Carry.getValueType();
31410 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31411 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31412
31413 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
31414 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31415 Op.getOperand(0), Op.getOperand(1),
31416 Carry.getValue(1));
31417
31418 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31419 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31420 Sum.getValue(1), DL, DAG);
31421 if (N->getValueType(1) == MVT::i1)
31422 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31423
31424 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31425}
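// Illustrative scalar sketch (not part of X86ISelLowering.cpp): ADDCARRY/ADC
// semantics as modeled by the lowering above -- the sum takes a carry-in and
// produces a carry-out on unsigned overflow (SBB is the subtraction analogue).
// The helper name is hypothetical.
#include <cassert>
#include <cstdint>

static uint32_t adc32(uint32_t A, uint32_t B, bool CarryIn, bool &CarryOut) {
  uint64_t Wide = uint64_t(A) + uint64_t(B) + (CarryIn ? 1 : 0);
  CarryOut = (Wide >> 32) != 0; // unsigned overflow sets the carry flag
  return uint32_t(Wide);
}

int main() {
  bool Carry = false;
  assert(adc32(0xFFFFFFFFu, 0u, /*CarryIn=*/true, Carry) == 0u && Carry);
  assert(adc32(1u, 2u, /*CarryIn=*/false, Carry) == 3u && !Carry);
  return 0;
}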
31426
31427static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31428 SelectionDAG &DAG) {
31429 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31430
31431 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31432 // which returns the values as { float, float } (in XMM0) or
31433 // { double, double } (which is returned in XMM0, XMM1).
31434 SDLoc dl(Op);
31435 SDValue Arg = Op.getOperand(0);
31436 EVT ArgVT = Arg.getValueType();
31437 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31438
31439 TargetLowering::ArgListTy Args;
31440 TargetLowering::ArgListEntry Entry;
31441
31442 Entry.Node = Arg;
31443 Entry.Ty = ArgTy;
31444 Entry.IsSExt = false;
31445 Entry.IsZExt = false;
31446 Args.push_back(Entry);
31447
31448 bool isF64 = ArgVT == MVT::f64;
31449 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31450 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31451 // the results are returned via SRet in memory.
31452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31453 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31454 const char *LibcallName = TLI.getLibcallName(LC);
31455 SDValue Callee =
31456 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31457
31458 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31459 : (Type *)FixedVectorType::get(ArgTy, 4);
31460
31461 TargetLowering::CallLoweringInfo CLI(DAG);
31462 CLI.setDebugLoc(dl)
31463 .setChain(DAG.getEntryNode())
31464 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31465
31466 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31467
31468 if (isF64)
31469 // Returned in xmm0 and xmm1.
31470 return CallResult.first;
31471
31472 // Returned in bits 0:31 and 32:64 xmm0.
31473 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31474 CallResult.first, DAG.getIntPtrConstant(0, dl));
31475 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31476 CallResult.first, DAG.getIntPtrConstant(1, dl));
31477 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31478 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31479}
31480
31481/// Widen a vector input to a vector of NVT. The
31482/// input vector must have the same element type as NVT.
31483static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31484 bool FillWithZeroes = false) {
31485 // Check if InOp already has the right width.
31486 MVT InVT = InOp.getSimpleValueType();
31487 if (InVT == NVT)
31488 return InOp;
31489
31490 if (InOp.isUndef())
31491 return DAG.getUNDEF(NVT);
31492
31493 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31494        "input and widen element type must match");
31495
31496 unsigned InNumElts = InVT.getVectorNumElements();
31497 unsigned WidenNumElts = NVT.getVectorNumElements();
31498 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31499        "Unexpected request for vector widening");
31500
31501 SDLoc dl(InOp);
31502 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31503 InOp.getNumOperands() == 2) {
31504 SDValue N1 = InOp.getOperand(1);
31505 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31506 N1.isUndef()) {
31507 InOp = InOp.getOperand(0);
31508 InVT = InOp.getSimpleValueType();
31509 InNumElts = InVT.getVectorNumElements();
31510 }
31511 }
31512 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31513 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31514 SmallVector<SDValue, 16> Ops;
31515 for (unsigned i = 0; i < InNumElts; ++i)
31516 Ops.push_back(InOp.getOperand(i));
31517
31518 EVT EltVT = InOp.getOperand(0).getValueType();
31519
31520 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31521 DAG.getUNDEF(EltVT);
31522 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31523 Ops.push_back(FillVal);
31524 return DAG.getBuildVector(NVT, dl, Ops);
31525 }
31526 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31527 DAG.getUNDEF(NVT);
31528 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31529 InOp, DAG.getIntPtrConstant(0, dl));
31530}
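// Illustrative scalar sketch (not part of X86ISelLowering.cpp): ExtendToType
// widens a vector to a larger element count, filling the tail with zeroes when
// FillWithZeroes is set (otherwise the tail is undef in the real lowering).
// The helper name is hypothetical.
#include <cassert>
#include <cstddef>
#include <vector>

static std::vector<int> extendToWidth(std::vector<int> In, std::size_t WidenNumElts,
                                      bool FillWithZeroes) {
  assert(WidenNumElts >= In.size() && WidenNumElts % In.size() == 0);
  // In the real lowering the tail is undef unless FillWithZeroes is set; this
  // sketch always uses zero so the result stays deterministic.
  (void)FillWithZeroes;
  In.resize(WidenNumElts, 0);
  return In;
}

int main() {
  std::vector<int> Wide = extendToWidth({1, 2, 3, 4}, 16, /*FillWithZeroes=*/true);
  assert(Wide.size() == 16 && Wide[3] == 4 && Wide[15] == 0);
  return 0;
}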
31531
31532static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31533 SelectionDAG &DAG) {
31534 assert(Subtarget.hasAVX512() &&
31535        "MGATHER/MSCATTER are supported on AVX-512 arch only");
31536
31537 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31538 SDValue Src = N->getValue();
31539 MVT VT = Src.getSimpleValueType();
31540 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31541 SDLoc dl(Op);
31542
31543 SDValue Scale = N->getScale();
31544 SDValue Index = N->getIndex();
31545 SDValue Mask = N->getMask();
31546 SDValue Chain = N->getChain();
31547 SDValue BasePtr = N->getBasePtr();
31548
31549 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31550 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31551 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31552 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31554 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31555 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31556 SDVTList VTs = DAG.getVTList(MVT::Other);
31557 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31558 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31559 N->getMemoryVT(), N->getMemOperand());
31560 }
31561 return SDValue();
31562 }
31563
31564 MVT IndexVT = Index.getSimpleValueType();
31565
31566 // If the index is v2i32, we're being called by type legalization and we
31567 // should just let the default handling take care of it.
31568 if (IndexVT == MVT::v2i32)
31569 return SDValue();
31570
31571 // If we don't have VLX and neither the passthru nor the index is 512 bits,
31572 // we need to widen until one is.
31573 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31574 !Index.getSimpleValueType().is512BitVector()) {
31575 // Determine how much we need to widen by to get a 512-bit type.
31576 unsigned Factor = std::min(512/VT.getSizeInBits(),
31577 512/IndexVT.getSizeInBits());
31578 unsigned NumElts = VT.getVectorNumElements() * Factor;
31579
31580 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31581 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31582 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31583
31584 Src = ExtendToType(Src, VT, DAG);
31585 Index = ExtendToType(Index, IndexVT, DAG);
31586 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31587 }
31588
31589 SDVTList VTs = DAG.getVTList(MVT::Other);
31590 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31591 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31592 N->getMemoryVT(), N->getMemOperand());
31593}
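// Illustrative scalar sketch (not part of X86ISelLowering.cpp): without VLX the
// scatter is widened so that the data or the index vector reaches 512 bits.
// The factor is min(512 / data-bits, 512 / index-bits); the element counts and
// types below (v4i32 data with a v4i64 index) are just an example.
#include <algorithm>
#include <cassert>

int main() {
  unsigned DataBits = 128, IndexBits = 256, NumElts = 4;       // v4i32 data, v4i64 index
  unsigned Factor = std::min(512 / DataBits, 512 / IndexBits); // min(4, 2) == 2
  unsigned WideNumElts = NumElts * Factor;                     // v8i32 data, v8i64 (512-bit) index
  assert(Factor == 2 && WideNumElts == 8);
  return 0;
}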
31594
31595static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31596 SelectionDAG &DAG) {
31597
31598 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31599 MVT VT = Op.getSimpleValueType();
31600 MVT ScalarVT = VT.getScalarType();
31601 SDValue Mask = N->getMask();
31602 MVT MaskVT = Mask.getSimpleValueType();
31603 SDValue PassThru = N->getPassThru();
31604 SDLoc dl(Op);
31605
31606 // Handle AVX masked loads which don't support passthru other than 0.
31607 if (MaskVT.getVectorElementType() != MVT::i1) {
31608 // We also allow undef in the isel pattern.
31609 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31610 return Op;
31611
31612 SDValue NewLoad = DAG.getMaskedLoad(
31613 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31614 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31615 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31616 N->isExpandingLoad());
31617 // Emit a blend.
31618 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31619 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31620 }
31621
31622 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31623        "Expanding masked load is supported on AVX-512 target only!");
31624
31625 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31626        "Expanding masked load is supported for 32 and 64-bit types only!");
31627
31628 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31629        "Cannot lower masked load op.");
31630
31631 assert((ScalarVT.getSizeInBits() >= 32 ||
31632         (Subtarget.hasBWI() &&
31633          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31634        "Unsupported masked load op.");
31635
31636 // This operation is legal for targets with VLX, but without
31637 // VLX the vector should be widened to 512 bits.
31638 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31639 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31640 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31641
31642 // Mask element has to be i1.
31643 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31644        "Unexpected mask type");
31645
31646 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31647
31648 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31649 SDValue NewLoad = DAG.getMaskedLoad(
31650 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31651 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31652 N->getExtensionType(), N->isExpandingLoad());
31653
31654 SDValue Extract =
31655 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31656 DAG.getIntPtrConstant(0, dl));
31657 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31658 return DAG.getMergeValues(RetOps, dl);
31659}
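// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the blend
// emitted above for AVX (non-i1-mask) masked loads computes, per lane,
// mask ? loaded-value : passthru. The AVX hardware load itself only supports
// a zero passthru, which is why the explicit VSELECT is needed.
#include <array>
#include <cassert>
#include <cstddef>

int main() {
  std::array<int, 4> Loaded = {10, 20, 30, 40}; // lanes produced by the masked load
  std::array<int, 4> PassThru = {1, 2, 3, 4};
  std::array<bool, 4> Mask = {true, false, true, false};
  std::array<int, 4> Result{};
  for (std::size_t I = 0; I < Result.size(); ++I)
    Result[I] = Mask[I] ? Loaded[I] : PassThru[I]; // the VSELECT blend
  assert(Result[0] == 10 && Result[1] == 2 && Result[2] == 30 && Result[3] == 4);
  return 0;
}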
31660
31661static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31662 SelectionDAG &DAG) {
31663 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31664 SDValue DataToStore = N->getValue();
31665 MVT VT = DataToStore.getSimpleValueType();
31666 MVT ScalarVT = VT.getScalarType();
31667 SDValue Mask = N->getMask();
31668 SDLoc dl(Op);
31669
31670 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31671        "Expanding masked load is supported on AVX-512 target only!");
31672
31673 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31674        "Expanding masked load is supported for 32 and 64-bit types only!");
31675
31676 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31677        "Cannot lower masked store op.");
31678
31679 assert((ScalarVT.getSizeInBits() >= 32 ||
31680         (Subtarget.hasBWI() &&
31681          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31682        "Unsupported masked store op.");
31683
31684 // This operation is legal for targets with VLX, but without
31685 // VLX the vector should be widened to 512 bits.
31686 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31687 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31688
31689 // Mask element has to be i1.
31690 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31691        "Unexpected mask type");
31692
31693 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31694
31695 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31696 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31697 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31698 N->getOffset(), Mask, N->getMemoryVT(),
31699 N->getMemOperand(), N->getAddressingMode(),
31700 N->isTruncatingStore(), N->isCompressingStore());
31701}
31702
31703static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31704 SelectionDAG &DAG) {
31705 assert(Subtarget.hasAVX2() &&
31706        "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31707
31708 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31709 SDLoc dl(Op);
31710 MVT VT = Op.getSimpleValueType();
31711 SDValue Index = N->getIndex();
31712 SDValue Mask = N->getMask();
31713 SDValue PassThru = N->getPassThru();
31714 MVT IndexVT = Index.getSimpleValueType();
31715
31716 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
31717
31718 // If the index is v2i32, we're being called by type legalization.
31719 if (IndexVT == MVT::v2i32)
31720 return SDValue();
31721
31722 // If we don't have VLX and neither the passthru nor the index is 512 bits,
31723 // we need to widen until one is.
31724 MVT OrigVT = VT;
31725 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31726 !IndexVT.is512BitVector()) {
31727 // Determine how much we need to widen by to get a 512-bit type.
31728 unsigned Factor = std::min(512/VT.getSizeInBits(),
31729 512/IndexVT.getSizeInBits());
31730
31731 unsigned NumElts = VT.getVectorNumElements() * Factor;
31732
31733 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31734 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31735 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31736
31737 PassThru = ExtendToType(PassThru, VT, DAG);
31738 Index = ExtendToType(Index, IndexVT, DAG);
31739 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31740 }
31741
31742 // Break dependency on the data register.
31743 if (PassThru.isUndef())
31744 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
31745
31746 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
31747 N->getScale() };
31748 SDValue NewGather = DAG.getMemIntrinsicNode(
31749 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
31750 N->getMemOperand());
31751 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
31752 NewGather, DAG.getIntPtrConstant(0, dl));
31753 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
31754}
31755
31756static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
31757 SDLoc dl(Op);
31758 SDValue Src = Op.getOperand(0);
31759 MVT DstVT = Op.getSimpleValueType();
31760
31761 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
31762 unsigned SrcAS = N->getSrcAddressSpace();
31763
31764 assert(SrcAS != N->getDestAddressSpace() &&
31765        "addrspacecast must be between different address spaces");
31766
31767 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
31768 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
31769 } else if (DstVT == MVT::i64) {
31770 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
31771 } else if (DstVT == MVT::i32) {
31772 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
31773 } else {
31774 report_fatal_error("Bad address space in addrspacecast");
31775 }
31776 return Op;
31777}
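// Illustrative scalar sketch (not part of X86ISelLowering.cpp): the
// addrspacecast lowering above zero-extends a 32-bit "unsigned" pointer
// (ptr32_uptr) to 64 bits, sign-extends other 32-bit pointers, and truncates
// when casting a 64-bit pointer down to 32 bits.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Ptr32 = 0x80000000u;
  uint64_t ZExt = uint64_t(Ptr32);                   // ptr32_uptr -> 64-bit
  uint64_t SExt = uint64_t(int64_t(int32_t(Ptr32))); // ptr32_sptr -> 64-bit
  uint32_t Trunc = uint32_t(0x123456789ABCDEF0ULL);  // 64-bit -> 32-bit
  assert(ZExt == 0x0000000080000000ULL);
  assert(SExt == 0xFFFFFFFF80000000ULL);
  assert(Trunc == 0x9ABCDEF0u);
  return 0;
}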
31778
31779SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
31780 SelectionDAG &DAG) const {
31781 // TODO: Eventually, the lowering of these nodes should be informed by or
31782 // deferred to the GC strategy for the function in which they appear. For
31783 // now, however, they must be lowered to something. Since they are logically
31784 // no-ops in the case of a null GC strategy (or a GC strategy which does not
31785 // require special handling for these nodes), lower them as literal NOOPs for
31786 // the time being.
31787 SmallVector<SDValue, 2> Ops;
31788
31789 Ops.push_back(Op.getOperand(0));
31790 if (Op->getGluedNode())
31791 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
31792
31793 SDLoc OpDL(Op);
31794 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
31795 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
31796
31797 return NOOP;
31798}
31799
31800// Custom split CVTPS2PH with wide types.
31801static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
31802 SDLoc dl(Op);
31803 EVT VT = Op.getValueType();
31804 SDValue Lo, Hi;
31805 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
31806 EVT LoVT, HiVT;
31807 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31808 SDValue RC = Op.getOperand(1);
31809 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
31810 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
31811 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31812}
31813
31814/// Provide custom lowering hooks for some operations.
31815SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
31816 switch (Op.getOpcode()) {
31817 default: llvm_unreachable("Should not custom lower this!");
31818 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
31819 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
31820 return LowerCMP_SWAP(Op, Subtarget, DAG);
31821 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
31822 case ISD::ATOMIC_LOAD_ADD:
31823 case ISD::ATOMIC_LOAD_SUB:
31824 case ISD::ATOMIC_LOAD_OR:
31825 case ISD::ATOMIC_LOAD_XOR:
31826 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
31827 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
31828 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
31829 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
31830 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
31831 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
31832 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
31833 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
31834 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
31835 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
31836 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
31837 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
31838 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
31839 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
31840 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
31841 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
31842 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
31843 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
31844 case ISD::SHL_PARTS:
31845 case ISD::SRA_PARTS:
31846 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
31847 case ISD::FSHL:
31848 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
31849 case ISD::STRICT_SINT_TO_FP:
31850 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
31851 case ISD::STRICT_UINT_TO_FP:
31852 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
31853 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
31854 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
31855 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
31856 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
31857 case ISD::ZERO_EXTEND_VECTOR_INREG:
31858 case ISD::SIGN_EXTEND_VECTOR_INREG:
31859 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
31860 case ISD::FP_TO_SINT:
31861 case ISD::STRICT_FP_TO_SINT:
31862 case ISD::FP_TO_UINT:
31863 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
31864 case ISD::FP_TO_SINT_SAT:
31865 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
31866 case ISD::FP_EXTEND:
31867 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
31868 case ISD::FP_ROUND:
31869 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
31870 case ISD::FP16_TO_FP:
31871 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
31872 case ISD::FP_TO_FP16:
31873 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
31874 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
31875 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
31876 case ISD::FADD:
31877 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
31878 case ISD::FROUND: return LowerFROUND(Op, DAG);
31879 case ISD::FABS:
31880 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
31881 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
31882 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
31883 case ISD::LRINT:
31884 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
31885 case ISD::SETCC:
31886 case ISD::STRICT_FSETCC:
31887 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
31888 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
31889 case ISD::SELECT: return LowerSELECT(Op, DAG);
31890 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
31891 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
31892 case ISD::VASTART: return LowerVASTART(Op, DAG);
31893 case ISD::VAARG: return LowerVAARG(Op, DAG);
31894 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
31895 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31896 case ISD::INTRINSIC_VOID:
31897 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31898 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
31899 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
31900 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
31901 case ISD::FRAME_TO_ARGS_OFFSET:
31902 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31903 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31904 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
31905 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
31906 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
31907 case ISD::EH_SJLJ_SETUP_DISPATCH:
31908 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31909 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
31910 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
31911 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
31912 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
31913 case ISD::CTLZ:
31914 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
31915 case ISD::CTTZ:
31916 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
31917 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
31918 case ISD::MULHS:
31919 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
31920 case ISD::ROTL:
31921 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
31922 case ISD::SRA:
31923 case ISD::SRL:
31924 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
31925 case ISD::SADDO:
31926 case ISD::UADDO:
31927 case ISD::SSUBO:
31928 case ISD::USUBO: return LowerXALUO(Op, DAG);
31929 case ISD::SMULO:
31930 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
31931 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31932 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
31933 case ISD::SADDO_CARRY:
31934 case ISD::SSUBO_CARRY:
31935 case ISD::ADDCARRY:
31936 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
31937 case ISD::ADD:
31938 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
31939 case ISD::UADDSAT:
31940 case ISD::SADDSAT:
31941 case ISD::USUBSAT:
31942 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31943 case ISD::SMAX:
31944 case ISD::SMIN:
31945 case ISD::UMAX:
31946 case ISD::UMIN: return LowerMINMAX(Op, DAG);
31947 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
31948 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
31949 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
31950 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
31951 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
31952 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
31953 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
31954 case ISD::GC_TRANSITION_START:
31955 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
31956 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
31957 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
31958 }
31959}
31960
31961/// Replace a node with an illegal result type with a new node built out of
31962/// custom code.
31963void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31964 SmallVectorImpl<SDValue>&Results,
31965 SelectionDAG &DAG) const {
31966 SDLoc dl(N);
31967 switch (N->getOpcode()) {
31968 default:
31969#ifndef NDEBUG
31970 dbgs() << "ReplaceNodeResults: ";
31971 N->dump(&DAG);
31972#endif
31973 llvm_unreachable("Do not know how to custom type legalize this operation!");
31974 case X86ISD::CVTPH2PS: {
31975 EVT VT = N->getValueType(0);
31976 SDValue Lo, Hi;
31977 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31978 EVT LoVT, HiVT;
31979 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31980 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31981 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31982 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31983 Results.push_back(Res);
31984 return;
31985 }
31986 case X86ISD::STRICT_CVTPH2PS: {
31987 EVT VT = N->getValueType(0);
31988 SDValue Lo, Hi;
31989 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31990 EVT LoVT, HiVT;
31991 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31992 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31993 {N->getOperand(0), Lo});
31994 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31995 {N->getOperand(0), Hi});
31996 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31997 Lo.getValue(1), Hi.getValue(1));
31998 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31999 Results.push_back(Res);
32000 Results.push_back(Chain);
32001 return;
32002 }
32003 case X86ISD::CVTPS2PH:
32004 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32005 return;
32006 case ISD::CTPOP: {
32007 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32008 // Use a v2i64 if possible.
32009 bool NoImplicitFloatOps =
32010 DAG.getMachineFunction().getFunction().hasFnAttribute(
32011 Attribute::NoImplicitFloat);
32012 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32013 SDValue Wide =
32014 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32015 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32016 // Bit count should fit in 32-bits, extract it as that and then zero
32017 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32018 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32019 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32020 DAG.getIntPtrConstant(0, dl));
32021 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32022 Results.push_back(Wide);
32023 }
32024 return;
32025 }
32026 case ISD::MUL: {
32027 EVT VT = N->getValueType(0);
32028 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32029        VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32030 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32031 // elements are needed.
32032 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32033 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32034 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32035 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32036 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32037 unsigned NumConcats = 16 / VT.getVectorNumElements();
32038 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32039 ConcatOps[0] = Res;
32040 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32041 Results.push_back(Res);
32042 return;
32043 }
32044 case X86ISD::VPMADDWD: {
32045 // Legalize types for X86ISD::VPMADDWD by widening.
32046 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32047
32048 EVT VT = N->getValueType(0);
32049 EVT InVT = N->getOperand(0).getValueType();
32050 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32051        "Expected a VT that divides into 128 bits.");
32052 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32053        "Unexpected type action!");
32054 unsigned NumConcat = 128 / InVT.getSizeInBits();
32055
32056 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32057 InVT.getVectorElementType(),
32058 NumConcat * InVT.getVectorNumElements());
32059 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32060 VT.getVectorElementType(),
32061 NumConcat * VT.getVectorNumElements());
32062
32063 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32064 Ops[0] = N->getOperand(0);
32065 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32066 Ops[0] = N->getOperand(1);
32067 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32068
32069 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32070 Results.push_back(Res);
32071 return;
32072 }
32073 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32074 case X86ISD::FMINC:
32075 case X86ISD::FMIN:
32076 case X86ISD::FMAXC:
32077 case X86ISD::FMAX: {
32078 EVT VT = N->getValueType(0);
32079 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32080 SDValue UNDEF = DAG.getUNDEF(VT);
32081 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32082 N->getOperand(0), UNDEF);
32083 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32084 N->getOperand(1), UNDEF);
32085 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32086 return;
32087 }
32088 case ISD::SDIV:
32089 case ISD::UDIV:
32090 case ISD::SREM:
32091 case ISD::UREM: {
32092 EVT VT = N->getValueType(0);
32093 if (VT.isVector()) {
32094 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32095        "Unexpected type action!");
32096 // If this RHS is a constant splat vector we can widen this and let
32097 // division/remainder by constant optimize it.
32098 // TODO: Can we do something for non-splat?
32099 APInt SplatVal;
32100 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32101 unsigned NumConcats = 128 / VT.getSizeInBits();
32102 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32103 Ops0[0] = N->getOperand(0);
32104 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32105 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32106 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32107 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32108 Results.push_back(Res);
32109 }
32110 return;
32111 }
32112
32113 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32114 Results.push_back(V);
32115 return;
32116 }
32117 case ISD::TRUNCATE: {
32118 MVT VT = N->getSimpleValueType(0);
32119 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32120 return;
32121
32122 // The generic legalizer will try to widen the input type to the same
32123 // number of elements as the widened result type. But this isn't always
32124 // the best thing so do some custom legalization to avoid some cases.
32125 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32126 SDValue In = N->getOperand(0);
32127 EVT InVT = In.getValueType();
32128
32129 unsigned InBits = InVT.getSizeInBits();
32130 if (128 % InBits == 0) {
32131 // 128-bit and smaller inputs should avoid truncate altogether and
32132 // just use a build_vector that will become a shuffle.
32133 // TODO: Widen and use a shuffle directly?
32134 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
32135 EVT EltVT = VT.getVectorElementType();
32136 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32137 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
32138 // Use the original element count so we don't do more scalar opts than
32139 // necessary.
32140 unsigned MinElts = VT.getVectorNumElements();
32141 for (unsigned i=0; i < MinElts; ++i) {
32142 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
32143 DAG.getIntPtrConstant(i, dl));
32144 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
32145 }
32146 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
32147 return;
32148 }
32149 // With AVX512 there are some cases that can use a target specific
32150 // truncate node to go from 256/512 to less than 128 with zeros in the
32151 // upper elements of the 128 bit result.
32152 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32153 // We can use VTRUNC directly for 256 bits with VLX, or for any 512 bits.
32154 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32155 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32156 return;
32157 }
32158 // There's one case we can widen to 512 bits and use VTRUNC.
32159 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32160 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32161 DAG.getUNDEF(MVT::v4i64));
32162 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32163 return;
32164 }
32165 }
32166 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32167 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32168 isTypeLegal(MVT::v4i64)) {
32169 // Input needs to be split and output needs to be widened. Let's use two
32170 // VTRUNCs, and shuffle their results together into the wider type.
32171 SDValue Lo, Hi;
32172 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32173
32174 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32175 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32176 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32177 { 0, 1, 2, 3, 16, 17, 18, 19,
32178 -1, -1, -1, -1, -1, -1, -1, -1 });
32179 Results.push_back(Res);
32180 return;
32181 }
32182
32183 return;
32184 }
32185 case ISD::ANY_EXTEND:
32186 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32187 // It's intended to custom handle the input type.
32188 assert(N->getValueType(0) == MVT::v8i8 &&
32189        "Do not know how to legalize this Node");
32190 return;
32191 case ISD::SIGN_EXTEND:
32192 case ISD::ZERO_EXTEND: {
32193 EVT VT = N->getValueType(0);
32194 SDValue In = N->getOperand(0);
32195 EVT InVT = In.getValueType();
32196 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32197 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32198 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32199        "Unexpected type action!");
32200 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32201 // Custom split this so we can extend i8/i16->i32 invec. This is better
32202 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32203 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
32204 // we allow the sra from the extend to i32 to be shared by the split.
32205 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32206
32207 // Fill a vector with sign bits for each element.
32208 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32209 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32210
32211 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32212 // to v2i64.
32213 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32214 {0, 4, 1, 5});
32215 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32216 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32217 {2, 6, 3, 7});
32218 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32219
32220 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32221 Results.push_back(Res);
32222 return;
32223 }
32224
32225 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32226 if (!InVT.is128BitVector()) {
32227 // Not a 128 bit vector, but maybe type legalization will promote
32228 // it to 128 bits.
32229 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32230 return;
32231 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32232 if (!InVT.is128BitVector())
32233 return;
32234
32235 // Promote the input to 128 bits. Type legalization will turn this into
32236 // zext_inreg/sext_inreg.
32237 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32238 }
32239
32240 // Perform custom splitting instead of the two stage extend we would get
32241 // by default.
32242 EVT LoVT, HiVT;
32243 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32244 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32245
32246 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32247
32248 // We need to shift the input over by half the number of elements.
32249 unsigned NumElts = InVT.getVectorNumElements();
32250 unsigned HalfNumElts = NumElts / 2;
32251 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32252 for (unsigned i = 0; i != HalfNumElts; ++i)
32253 ShufMask[i] = i + HalfNumElts;
32254
32255 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32256 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32257
32258 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32259 Results.push_back(Res);
32260 }
32261 return;
32262 }
32263 case ISD::FP_TO_SINT:
32264 case ISD::STRICT_FP_TO_SINT:
32265 case ISD::FP_TO_UINT:
32266 case ISD::STRICT_FP_TO_UINT: {
32267 bool IsStrict = N->isStrictFPOpcode();
32268 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32269 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32270 EVT VT = N->getValueType(0);
32271 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32272 EVT SrcVT = Src.getValueType();
32273
32274 if (VT.isVector() && Subtarget.hasFP16() &&
32275 SrcVT.getVectorElementType() == MVT::f16) {
32276 EVT EleVT = VT.getVectorElementType();
32277 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32278
32279 if (SrcVT != MVT::v8f16) {
32280 SDValue Tmp =
32281 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32282 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32283 Ops[0] = Src;
32284 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32285 }
32286
32287 SDValue Res, Chain;
32288 if (IsStrict) {
32289 unsigned Opc =
32290 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32291 Res =
32292 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32293 Chain = Res.getValue(1);
32294 } else {
32295 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32296 Res = DAG.getNode(Opc, dl, ResVT, Src);
32297 }
32298
32299 // TODO: Need to add exception check code for strict FP.
32300 if (EleVT.getSizeInBits() < 16) {
32301 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32302 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32303
32304 // Now widen to 128 bits.
32305 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32306 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32307 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32308 ConcatOps[0] = Res;
32309 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32310 }
32311
32312 Results.push_back(Res);
32313 if (IsStrict)
32314 Results.push_back(Chain);
32315
32316 return;
32317 }
32318
32319 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32320 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32321        "Unexpected type action!");
32322
32323 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32324 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32325 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32326 VT.getVectorNumElements());
32327 SDValue Res;
32328 SDValue Chain;
32329 if (IsStrict) {
32330 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32331 {N->getOperand(0), Src});
32332 Chain = Res.getValue(1);
32333 } else
32334 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32335
32336 // Preserve what we know about the size of the original result. If the
32337 // result is v2i32, we have to manually widen the assert.
32338 if (PromoteVT == MVT::v2i32)
32339 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32340 DAG.getUNDEF(MVT::v2i32));
32341
32342 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32343 Res.getValueType(), Res,
32344 DAG.getValueType(VT.getVectorElementType()));
32345
32346 if (PromoteVT == MVT::v2i32)
32347 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32348 DAG.getIntPtrConstant(0, dl));
32349
32350 // Truncate back to the original width.
32351 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32352
32353 // Now widen to 128 bits.
32354 unsigned NumConcats = 128 / VT.getSizeInBits();
32355 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32356 VT.getVectorNumElements() * NumConcats);
32357 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32358 ConcatOps[0] = Res;
32359 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32360 Results.push_back(Res);
32361 if (IsStrict)
32362 Results.push_back(Chain);
32363 return;
32364 }
32365
32366
32367 if (VT == MVT::v2i32) {
32368 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32369        "Strict unsigned conversion requires AVX512");
32370 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32371 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32372        "Unexpected type action!");
32373 if (Src.getValueType() == MVT::v2f64) {
32374 if (!IsSigned && !Subtarget.hasAVX512()) {
32375 SDValue Res =
32376 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32377 Results.push_back(Res);
32378 return;
32379 }
32380
32381 unsigned Opc;
32382 if (IsStrict)
32383 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32384 else
32385 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32386
32387 // If we have VLX we can emit a target specific FP_TO_UINT node.
32388 if (!IsSigned && !Subtarget.hasVLX()) {
32389 // Otherwise we can defer to the generic legalizer which will widen
32390 // the input as well. This will be further widened during op
32391 // legalization to v8i32<-v8f64.
32392 // For strict nodes we'll need to widen ourselves.
32393 // FIXME: Fix the type legalizer to safely widen strict nodes?
32394 if (!IsStrict)
32395 return;
32396 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32397 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32398 Opc = N->getOpcode();
32399 }
32400 SDValue Res;
32401 SDValue Chain;
32402 if (IsStrict) {
32403 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32404 {N->getOperand(0), Src});
32405 Chain = Res.getValue(1);
32406 } else {
32407 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32408 }
32409 Results.push_back(Res);
32410 if (IsStrict)
32411 Results.push_back(Chain);
32412 return;
32413 }
32414
32415 // Custom widen strict v2f32->v2i32 by padding with zeros.
32416 // FIXME: Should generic type legalizer do this?
32417 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32418 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32419 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32420 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32421 {N->getOperand(0), Src});
32422 Results.push_back(Res);
32423 Results.push_back(Res.getValue(1));
32424 return;
32425 }
32426
32427 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32428 // so early out here.
32429 return;
32430 }
32431
32432 assert(!VT.isVector() && "Vectors should have been handled above!");
32433
32434 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32435 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32436 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32437 assert(!Subtarget.is64Bit() && "i64 should be legal");
32438 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32439 // If we use a 128-bit result we might need to use a target specific node.
32440 unsigned SrcElts =
32441 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32442 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32443 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32444 unsigned Opc = N->getOpcode();
32445 if (NumElts != SrcElts) {
32446 if (IsStrict)
32447 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32448 else
32449 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32450 }
32451
32452 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32453 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32454 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32455 ZeroIdx);
32456 SDValue Chain;
32457 if (IsStrict) {
32458 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32459 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32460 Chain = Res.getValue(1);
32461 } else
32462 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32463 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32464 Results.push_back(Res);
32465 if (IsStrict)
32466 Results.push_back(Chain);
32467 return;
32468 }
32469
32470 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32471 SDValue Chain;
32472 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32473 Results.push_back(V);
32474 if (IsStrict)
32475 Results.push_back(Chain);
32476 return;
32477 }
32478
32479 SDValue Chain;
32480 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32481 Results.push_back(V);
32482 if (IsStrict)
32483 Results.push_back(Chain);
32484 }
32485 return;
32486 }
32487 case ISD::LRINT:
32488 case ISD::LLRINT: {
32489 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32490 Results.push_back(V);
32491 return;
32492 }
32493
32494 case ISD::SINT_TO_FP:
32495 case ISD::STRICT_SINT_TO_FP:
32496 case ISD::UINT_TO_FP:
32497 case ISD::STRICT_UINT_TO_FP: {
32498 bool IsStrict = N->isStrictFPOpcode();
32499 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32500 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32501 EVT VT = N->getValueType(0);
32502 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32503 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32504 Subtarget.hasVLX()) {
32505 if (Src.getValueType().getVectorElementType() == MVT::i16)
32506 return;
32507
32508 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32509 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32510 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32511 : DAG.getUNDEF(MVT::v2i32));
32512 if (IsStrict) {
32513 unsigned Opc =
32514 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32515 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32516 {N->getOperand(0), Src});
32517 Results.push_back(Res);
32518 Results.push_back(Res.getValue(1));
32519 } else {
32520 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32521 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32522 }
32523 return;
32524 }
32525 if (VT != MVT::v2f32)
32526 return;
32527 EVT SrcVT = Src.getValueType();
32528 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32529 if (IsStrict) {
32530 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32531 : X86ISD::STRICT_CVTUI2P;
32532 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32533 {N->getOperand(0), Src});
32534 Results.push_back(Res);
32535 Results.push_back(Res.getValue(1));
32536 } else {
32537 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32538 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
32539 }
32540 return;
32541 }
32542 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
32543 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
32544 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
32545 SDValue One = DAG.getConstant(1, dl, SrcVT);
32546 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
32547 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
32548 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
32549 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
32550 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
32551 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
32552 for (int i = 0; i != 2; ++i) {
32553 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
32554 SignSrc, DAG.getIntPtrConstant(i, dl));
32555 if (IsStrict)
32556 SignCvts[i] =
32557 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
32558 {N->getOperand(0), Elt});
32559 else
32560 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
32561 }
32562 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
32563 SDValue Slow, Chain;
32564 if (IsStrict) {
32565 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32566 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
32567 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
32568 {Chain, SignCvt, SignCvt});
32569 Chain = Slow.getValue(1);
32570 } else {
32571 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
32572 }
32573 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
32574 IsNeg =
32575 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
32576 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
32577 Results.push_back(Cvt);
32578 if (IsStrict)
32579 Results.push_back(Chain);
32580 return;
32581 }
32582
32583 if (SrcVT != MVT::v2i32)
32584 return;
32585
32586 if (IsSigned || Subtarget.hasAVX512()) {
32587 if (!IsStrict)
32588 return;
32589
32590 // Custom widen strict v2i32->v2f32 to avoid scalarization.
32591 // FIXME: Should generic type legalizer do this?
32592 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32593 DAG.getConstant(0, dl, MVT::v2i32));
32594 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
32595 {N->getOperand(0), Src});
32596 Results.push_back(Res);
32597 Results.push_back(Res.getValue(1));
32598 return;
32599 }
32600
32601 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32602 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
32603 SDValue VBias =
32604 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
32605 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
32606 DAG.getBitcast(MVT::v2i64, VBias));
32607 Or = DAG.getBitcast(MVT::v2f64, Or);
32608 if (IsStrict) {
32609 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
32610 {N->getOperand(0), Or, VBias});
32611 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
32612 {MVT::v4f32, MVT::Other},
32613 {Sub.getValue(1), Sub});
32614 Results.push_back(Res);
32615 Results.push_back(Res.getValue(1));
32616 } else {
32617 // TODO: Are there any fast-math-flags to propagate here?
32618 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
32619 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
32620 }
32621 return;
32622 }
32623 case ISD::STRICT_FP_ROUND:
32624 case ISD::FP_ROUND: {
32625 bool IsStrict = N->isStrictFPOpcode();
32626 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32627 EVT VT = N->getValueType(0);
32628 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
32629 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
32630 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
32631 : DAG.getUNDEF(MVT::v2f32);
32632 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
32633 }
32634 if (!isTypeLegal(Src.getValueType()))
32635 return;
32636 SDValue V;
32637 if (IsStrict)
32638 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
32639 {N->getOperand(0), Src});
32640 else
32641 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
32642 Results.push_back(V);
32643 if (IsStrict)
32644 Results.push_back(V.getValue(1));
32645 return;
32646 }
32647 case ISD::FP_EXTEND:
32648 case ISD::STRICT_FP_EXTEND: {
32649 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
32650 // No other ValueType for FP_EXTEND should reach this point.
32651 assert(N->getValueType(0) == MVT::v2f32 &&
32652        "Do not know how to legalize this Node");
32653 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
32654 return;
32655 bool IsStrict = N->isStrictFPOpcode();
32656 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32657 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
32658 : DAG.getUNDEF(MVT::v2f16);
32659 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
32660 if (IsStrict)
32661 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
32662 {N->getOperand(0), V});
32663 else
32664 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
32665 Results.push_back(V);
32666 if (IsStrict)
32667 Results.push_back(V.getValue(1));
32668 return;
32669 }
32670 case ISD::INTRINSIC_W_CHAIN: {
32671 unsigned IntNo = N->getConstantOperandVal(1);
32672 switch (IntNo) {
32673 default : llvm_unreachable("Do not know how to custom type "
32674                            "legalize this intrinsic operation!");
32675 case Intrinsic::x86_rdtsc:
32676 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
32677 Results);
32678 case Intrinsic::x86_rdtscp:
32679 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
32680 Results);
32681 case Intrinsic::x86_rdpmc:
32682 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
32683 Results);
32684 return;
32685 case Intrinsic::x86_xgetbv:
32686 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
32687 Results);
32688 return;
32689 }
32690 }
32691 case ISD::READCYCLECOUNTER: {
32692 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
32693 }
32694 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
32695 EVT T = N->getValueType(0);
32696 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
32697 bool Regs64bit = T == MVT::i128;
32698 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
32699        "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
32700 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
32701 SDValue cpInL, cpInH;
32702 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32703 DAG.getConstant(0, dl, HalfT));
32704 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32705 DAG.getConstant(1, dl, HalfT));
32706 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
32707 Regs64bit ? X86::RAX : X86::EAX,
32708 cpInL, SDValue());
32709 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
32710 Regs64bit ? X86::RDX : X86::EDX,
32711 cpInH, cpInL.getValue(1));
32712 SDValue swapInL, swapInH;
32713 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32714 DAG.getConstant(0, dl, HalfT));
32715 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32716 DAG.getConstant(1, dl, HalfT));
32717 swapInH =
32718 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
32719 swapInH, cpInH.getValue(1));
32720
32721 // In 64-bit mode we might need the base pointer in RBX, but we can't know
32722 // until later. So we keep the RBX input in a vreg and use a custom
32723 // inserter.
32724 // Since RBX will be a reserved register the register allocator will not
32725 // make sure its value will be properly saved and restored around this
32726 // live-range.
32727 SDValue Result;
32728 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32729 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
32730 if (Regs64bit) {
32731 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
32732 swapInH.getValue(1)};
32733 Result =
32734 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
32735 } else {
32736 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
32737 swapInH.getValue(1));
32738 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
32739 swapInL.getValue(1)};
32740 Result =
32741 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
32742 }
32743
32744 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
32745 Regs64bit ? X86::RAX : X86::EAX,
32746 HalfT, Result.getValue(1));
32747 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
32748 Regs64bit ? X86::RDX : X86::EDX,
32749 HalfT, cpOutL.getValue(2));
32750 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
32751
32752 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
32753 MVT::i32, cpOutH.getValue(2));
32754 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
32755 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
32756
32757 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
32758 Results.push_back(Success);
32759 Results.push_back(EFLAGS.getValue(1));
32760 return;
32761 }
32762 case ISD::ATOMIC_LOAD: {
32763 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32764 bool NoImplicitFloatOps =
32765 DAG.getMachineFunction().getFunction().hasFnAttribute(
32766 Attribute::NoImplicitFloat);
32767 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32768 auto *Node = cast<AtomicSDNode>(N);
32769 if (Subtarget.hasSSE1()) {
32770 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
32771 // Then extract the lower 64-bits.
32772 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32773 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
32774 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32775 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32776 MVT::i64, Node->getMemOperand());
32777 if (Subtarget.hasSSE2()) {
32778 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
32779 DAG.getIntPtrConstant(0, dl));
32780 Results.push_back(Res);
32781 Results.push_back(Ld.getValue(1));
32782 return;
32783 }
32784 // We use an alternative sequence for SSE1 that extracts as v2f32 and
32785 // then casts to i64. This avoids a 128-bit stack temporary being
32786 // created by type legalization if we were to cast v4f32->v2i64.
32787 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
32788 DAG.getIntPtrConstant(0, dl));
32789 Res = DAG.getBitcast(MVT::i64, Res);
32790 Results.push_back(Res);
32791 Results.push_back(Ld.getValue(1));
32792 return;
32793 }
32794 if (Subtarget.hasX87()) {
32795 // First load this into an 80-bit X87 register. This will put the whole
32796 // integer into the significand.
32797 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32798 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32799 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
32800 dl, Tys, Ops, MVT::i64,
32801 Node->getMemOperand());
32802 SDValue Chain = Result.getValue(1);
32803
32804 // Now store the X87 register to a stack temporary and convert to i64.
32805 // This store is not atomic and doesn't need to be.
32806 // FIXME: We don't need a stack temporary if the result of the load
32807 // is already being stored. We could just directly store there.
32808 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32809 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32810 MachinePointerInfo MPI =
32811 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32812 SDValue StoreOps[] = { Chain, Result, StackPtr };
32813 Chain = DAG.getMemIntrinsicNode(
32814 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
32815 MPI, None /*Align*/, MachineMemOperand::MOStore);
32816
32817 // Finally load the value back from the stack temporary and return it.
32818 // This load is not atomic and doesn't need to be.
32819 // This load will be further type legalized.
32820 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
32821 Results.push_back(Result);
32822 Results.push_back(Result.getValue(1));
32823 return;
32824 }
32825 }
32826 // TODO: Use MOVLPS when SSE1 is available?
32827 // Delegate to generic TypeLegalization. Situations we can really handle
32828 // should have already been dealt with by AtomicExpandPass.cpp.
32829 break;
32830 }
32831 case ISD::ATOMIC_SWAP:
32832 case ISD::ATOMIC_LOAD_ADD:
32833 case ISD::ATOMIC_LOAD_SUB:
32834 case ISD::ATOMIC_LOAD_AND:
32835 case ISD::ATOMIC_LOAD_OR:
32836 case ISD::ATOMIC_LOAD_XOR:
32837 case ISD::ATOMIC_LOAD_NAND:
32838 case ISD::ATOMIC_LOAD_MIN:
32839 case ISD::ATOMIC_LOAD_MAX:
32840 case ISD::ATOMIC_LOAD_UMIN:
32841 case ISD::ATOMIC_LOAD_UMAX:
32842 // Delegate to generic TypeLegalization. Situations we can really handle
32843 // should have already been dealt with by AtomicExpandPass.cpp.
32844 break;
32845
32846 case ISD::BITCAST: {
32847 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32848 EVT DstVT = N->getValueType(0);
32849 EVT SrcVT = N->getOperand(0).getValueType();
32850
32851 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
32852 // we can split using the k-register rather than memory.
32853 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
32854 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32855 SDValue Lo, Hi;
32856 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32857 Lo = DAG.getBitcast(MVT::i32, Lo);
32858 Hi = DAG.getBitcast(MVT::i32, Hi);
32859 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
32860 Results.push_back(Res);
32861 return;
32862 }
32863
32864 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
32865 // FIXME: Use v4f32 for SSE1?
32866 assert(Subtarget.hasSSE2() && "Requires SSE2");
32867 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
32868        "Unexpected type action!");
32869 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
32870 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
32871 N->getOperand(0));
32872 Res = DAG.getBitcast(WideVT, Res);
32873 Results.push_back(Res);
32874 return;
32875 }
32876
32877 return;
32878 }
32879 case ISD::MGATHER: {
32880 EVT VT = N->getValueType(0);
32881 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32882 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32883 auto *Gather = cast<MaskedGatherSDNode>(N);
32884 SDValue Index = Gather->getIndex();
32885 if (Index.getValueType() != MVT::v2i64)
32886 return;
32887 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32888        "Unexpected type action!");
32889 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32890 SDValue Mask = Gather->getMask();
32891 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32892 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32893 Gather->getPassThru(),
32894 DAG.getUNDEF(VT));
32895 if (!Subtarget.hasVLX()) {
32896 // We need to widen the mask, but the instruction will only use 2
32897 // of its elements. So we can use undef.
32898 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32899 DAG.getUNDEF(MVT::v2i1));
32900 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32901 }
32902 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32903 Gather->getBasePtr(), Index, Gather->getScale() };
32904 SDValue Res = DAG.getMemIntrinsicNode(
32905 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32906 Gather->getMemoryVT(), Gather->getMemOperand());
32907 Results.push_back(Res);
32908 Results.push_back(Res.getValue(1));
32909 return;
32910 }
32911 return;
32912 }
32913 case ISD::LOAD: {
32914 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
32915 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
32916 // cast since type legalization will try to use an i64 load.
32917 MVT VT = N->getSimpleValueType(0);
32918 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32919 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32920        "Unexpected type action!");
32921 if (!ISD::isNON_EXTLoad(N))
32922 return;
32923 auto *Ld = cast<LoadSDNode>(N);
32924 if (Subtarget.hasSSE2()) {
32925 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32926 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32927 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32928 Ld->getMemOperand()->getFlags());
32929 SDValue Chain = Res.getValue(1);
32930 MVT VecVT = MVT::getVectorVT(LdVT, 2);
32931 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32932 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32933 Res = DAG.getBitcast(WideVT, Res);
32934 Results.push_back(Res);
32935 Results.push_back(Chain);
32936 return;
32937 }
32938 assert(Subtarget.hasSSE1() && "Expected SSE");
32939 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32940 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32941 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32942 MVT::i64, Ld->getMemOperand());
32943 Results.push_back(Res);
32944 Results.push_back(Res.getValue(1));
32945 return;
32946 }
32947 case ISD::ADDRSPACECAST: {
32948 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32949 Results.push_back(V);
32950 return;
32951 }
32952 case ISD::BITREVERSE:
32953 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32954 assert(Subtarget.hasXOP() && "Expected XOP");
32955 // We can use VPPERM by copying to a vector register and back. We'll need
32956 // to move the scalar in two i32 pieces.
32957 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32958 return;
32959 }
32960}
32961
32962const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
32963 switch ((X86ISD::NodeType)Opcode) {
32964 case X86ISD::FIRST_NUMBER: break;
32965#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
32966 NODE_NAME_CASE(BSF)
32967 NODE_NAME_CASE(BSR)
32968 NODE_NAME_CASE(FSHL)
32969 NODE_NAME_CASE(FSHR)
32970 NODE_NAME_CASE(FAND)
32971 NODE_NAME_CASE(FANDN)
32972 NODE_NAME_CASE(FOR)
32973 NODE_NAME_CASE(FXOR)
32974 NODE_NAME_CASE(FILD)
32975 NODE_NAME_CASE(FIST)
32976 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
32977 NODE_NAME_CASE(FLD)
32978 NODE_NAME_CASE(FST)
32979 NODE_NAME_CASE(CALL)
32980 NODE_NAME_CASE(CALL_RVMARKER)
32981 NODE_NAME_CASE(BT)
32982 NODE_NAME_CASE(CMP)
32983 NODE_NAME_CASE(FCMP)
32984 NODE_NAME_CASE(STRICT_FCMP)
32985 NODE_NAME_CASE(STRICT_FCMPS)
32986 NODE_NAME_CASE(COMI)
32987 NODE_NAME_CASE(UCOMI)
32988 NODE_NAME_CASE(CMPM)
32989 NODE_NAME_CASE(CMPMM)
32990 NODE_NAME_CASE(STRICT_CMPM)
32991 NODE_NAME_CASE(CMPMM_SAE)
32992 NODE_NAME_CASE(SETCC)
32993 NODE_NAME_CASE(SETCC_CARRY)
32994 NODE_NAME_CASE(FSETCC)
32995 NODE_NAME_CASE(FSETCCM)
32996 NODE_NAME_CASE(FSETCCM_SAE)
32997 NODE_NAME_CASE(CMOV)
32998 NODE_NAME_CASE(BRCOND)
32999 NODE_NAME_CASE(RET_FLAG)
33000 NODE_NAME_CASE(IRET)
33001 NODE_NAME_CASE(REP_STOS)
33002 NODE_NAME_CASE(REP_MOVS)
33003 NODE_NAME_CASE(GlobalBaseReg)
33004 NODE_NAME_CASE(Wrapper)
33005 NODE_NAME_CASE(WrapperRIP)
33006 NODE_NAME_CASE(MOVQ2DQ)
33007 NODE_NAME_CASE(MOVDQ2Q)
33008 NODE_NAME_CASE(MMX_MOVD2W)
33009 NODE_NAME_CASE(MMX_MOVW2D)
33010 NODE_NAME_CASE(PEXTRB)
33011 NODE_NAME_CASE(PEXTRW)
33012 NODE_NAME_CASE(INSERTPS)
33013 NODE_NAME_CASE(PINSRB)
33014 NODE_NAME_CASE(PINSRW)
33015 NODE_NAME_CASE(PSHUFB)
33016 NODE_NAME_CASE(ANDNP)
33017 NODE_NAME_CASE(BLENDI)
33018 NODE_NAME_CASE(BLENDV)
33019 NODE_NAME_CASE(HADD)
33020 NODE_NAME_CASE(HSUB)
33021 NODE_NAME_CASE(FHADD)
33022 NODE_NAME_CASE(FHSUB)
33023 NODE_NAME_CASE(CONFLICT)
33024 NODE_NAME_CASE(FMAX)
33025 NODE_NAME_CASE(FMAXS)
33026 NODE_NAME_CASE(FMAX_SAE)
33027 NODE_NAME_CASE(FMAXS_SAE)
33028 NODE_NAME_CASE(FMIN)
33029 NODE_NAME_CASE(FMINS)
33030 NODE_NAME_CASE(FMIN_SAE)
33031 NODE_NAME_CASE(FMINS_SAE)
33032 NODE_NAME_CASE(FMAXC)
33033 NODE_NAME_CASE(FMINC)
33034 NODE_NAME_CASE(FRSQRT)
33035 NODE_NAME_CASE(FRCP)
33036 NODE_NAME_CASE(EXTRQI)
33037 NODE_NAME_CASE(INSERTQI)
33038 NODE_NAME_CASE(TLSADDR)
33039 NODE_NAME_CASE(TLSBASEADDR)
33040 NODE_NAME_CASE(TLSCALL)
33041 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33042 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33043 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33044 NODE_NAME_CASE(EH_RETURN)
33045 NODE_NAME_CASE(TC_RETURN)
33046 NODE_NAME_CASE(FNSTCW16m)
33047 NODE_NAME_CASE(FLDCW16m)
33048 NODE_NAME_CASE(LCMPXCHG_DAG)
33049 NODE_NAME_CASE(LCMPXCHG8_DAG)
33050 NODE_NAME_CASE(LCMPXCHG16_DAG)
33051 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33052 NODE_NAME_CASE(LADD)
33053 NODE_NAME_CASE(LSUB)
33054 NODE_NAME_CASE(LOR)
33055 NODE_NAME_CASE(LXOR)
33056 NODE_NAME_CASE(LAND)
33057 NODE_NAME_CASE(LBTS)
33058 NODE_NAME_CASE(LBTC)
33059 NODE_NAME_CASE(LBTR)
33060 NODE_NAME_CASE(VZEXT_MOVL)
33061 NODE_NAME_CASE(VZEXT_LOAD)
33062 NODE_NAME_CASE(VEXTRACT_STORE)
33063 NODE_NAME_CASE(VTRUNC)
33064 NODE_NAME_CASE(VTRUNCS)
33065 NODE_NAME_CASE(VTRUNCUS)
33066 NODE_NAME_CASE(VMTRUNC)
33067 NODE_NAME_CASE(VMTRUNCS)
33068 NODE_NAME_CASE(VMTRUNCUS)
33069 NODE_NAME_CASE(VTRUNCSTORES)
33070 NODE_NAME_CASE(VTRUNCSTOREUS)
33071 NODE_NAME_CASE(VMTRUNCSTORES)
33072 NODE_NAME_CASE(VMTRUNCSTOREUS)
33073 NODE_NAME_CASE(VFPEXT)
33074 NODE_NAME_CASE(STRICT_VFPEXT)
33075 NODE_NAME_CASE(VFPEXT_SAE)
33076 NODE_NAME_CASE(VFPEXTS)
33077 NODE_NAME_CASE(VFPEXTS_SAE)
33078 NODE_NAME_CASE(VFPROUND)
33079 NODE_NAME_CASE(STRICT_VFPROUND)
33080 NODE_NAME_CASE(VMFPROUND)
33081 NODE_NAME_CASE(VFPROUND_RND)
33082 NODE_NAME_CASE(VFPROUNDS)
33083 NODE_NAME_CASE(VFPROUNDS_RND)
33084 NODE_NAME_CASE(VSHLDQ)
33085 NODE_NAME_CASE(VSRLDQ)
33086 NODE_NAME_CASE(VSHL)
33087 NODE_NAME_CASE(VSRL)
33088 NODE_NAME_CASE(VSRA)
33089 NODE_NAME_CASE(VSHLI)
33090 NODE_NAME_CASE(VSRLI)
33091 NODE_NAME_CASE(VSRAI)
33092 NODE_NAME_CASE(VSHLV)
33093 NODE_NAME_CASE(VSRLV)
33094 NODE_NAME_CASE(VSRAV)
33095 NODE_NAME_CASE(VROTLI)
33096 NODE_NAME_CASE(VROTRI)
33097 NODE_NAME_CASE(VPPERM)
33098 NODE_NAME_CASE(CMPP)
33099 NODE_NAME_CASE(STRICT_CMPP)
33100 NODE_NAME_CASE(PCMPEQ)
33101 NODE_NAME_CASE(PCMPGT)
33102 NODE_NAME_CASE(PHMINPOS)
33103 NODE_NAME_CASE(ADD)
33104 NODE_NAME_CASE(SUB)
33105 NODE_NAME_CASE(ADC)
33106 NODE_NAME_CASE(SBB)
33107 NODE_NAME_CASE(SMUL)
33108 NODE_NAME_CASE(UMUL)
33109 NODE_NAME_CASE(OR)
33110 NODE_NAME_CASE(XOR)
33111 NODE_NAME_CASE(AND)
33112 NODE_NAME_CASE(BEXTR)
33113 NODE_NAME_CASE(BEXTRI)
33114 NODE_NAME_CASE(BZHI)
33115 NODE_NAME_CASE(PDEP)
33116 NODE_NAME_CASE(PEXT)
33117 NODE_NAME_CASE(MUL_IMM)
33118 NODE_NAME_CASE(MOVMSK)
33119 NODE_NAME_CASE(PTEST)
33120 NODE_NAME_CASE(TESTP)
33121 NODE_NAME_CASE(KORTEST)
33122 NODE_NAME_CASE(KTEST)
33123 NODE_NAME_CASE(KADD)
33124 NODE_NAME_CASE(KSHIFTL)
33125 NODE_NAME_CASE(KSHIFTR)
33126 NODE_NAME_CASE(PACKSS)
33127 NODE_NAME_CASE(PACKUS)
33128 NODE_NAME_CASE(PALIGNR)
33129 NODE_NAME_CASE(VALIGN)
33130 NODE_NAME_CASE(VSHLD)
33131 NODE_NAME_CASE(VSHRD)
33132 NODE_NAME_CASE(VSHLDV)
33133 NODE_NAME_CASE(VSHRDV)
33134 NODE_NAME_CASE(PSHUFD)
33135 NODE_NAME_CASE(PSHUFHW)
33136 NODE_NAME_CASE(PSHUFLW)
33137 NODE_NAME_CASE(SHUFP)
33138 NODE_NAME_CASE(SHUF128)
33139 NODE_NAME_CASE(MOVLHPS)
33140 NODE_NAME_CASE(MOVHLPS)
33141 NODE_NAME_CASE(MOVDDUP)
33142 NODE_NAME_CASE(MOVSHDUP)
33143 NODE_NAME_CASE(MOVSLDUP)
33144 NODE_NAME_CASE(MOVSD)
33145 NODE_NAME_CASE(MOVSS)
33146 NODE_NAME_CASE(MOVSH)
33147 NODE_NAME_CASE(UNPCKL)
33148 NODE_NAME_CASE(UNPCKH)
33149 NODE_NAME_CASE(VBROADCAST)
33150 NODE_NAME_CASE(VBROADCAST_LOAD)
33151 NODE_NAME_CASE(VBROADCASTM)
33152 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33153 NODE_NAME_CASE(VPERMILPV)
33154 NODE_NAME_CASE(VPERMILPI)
33155 NODE_NAME_CASE(VPERM2X128)
33156 NODE_NAME_CASE(VPERMV)
33157 NODE_NAME_CASE(VPERMV3)
33158 NODE_NAME_CASE(VPERMI)
33159 NODE_NAME_CASE(VPTERNLOG)
33160 NODE_NAME_CASE(VFIXUPIMM)
33161 NODE_NAME_CASE(VFIXUPIMM_SAE)
33162 NODE_NAME_CASE(VFIXUPIMMS)
33163 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33164 NODE_NAME_CASE(VRANGE)
33165 NODE_NAME_CASE(VRANGE_SAE)
33166 NODE_NAME_CASE(VRANGES)
33167 NODE_NAME_CASE(VRANGES_SAE)
33168 NODE_NAME_CASE(PMULUDQ)
33169 NODE_NAME_CASE(PMULDQ)
33170 NODE_NAME_CASE(PSADBW)
33171 NODE_NAME_CASE(DBPSADBW)
33172 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33173 NODE_NAME_CASE(VAARG_64)
33174 NODE_NAME_CASE(VAARG_X32)
33175 NODE_NAME_CASE(DYN_ALLOCA)
33176 NODE_NAME_CASE(MEMBARRIER)
33177 NODE_NAME_CASE(MFENCE)
33178 NODE_NAME_CASE(SEG_ALLOCA)
33179 NODE_NAME_CASE(PROBED_ALLOCA)
33180 NODE_NAME_CASE(RDRAND)
33181 NODE_NAME_CASE(RDSEED)
33182 NODE_NAME_CASE(RDPKRU)
33183 NODE_NAME_CASE(WRPKRU)
33184 NODE_NAME_CASE(VPMADDUBSW)
33185 NODE_NAME_CASE(VPMADDWD)
33186 NODE_NAME_CASE(VPSHA)
33187 NODE_NAME_CASE(VPSHL)
33188 NODE_NAME_CASE(VPCOM)
33189 NODE_NAME_CASE(VPCOMU)
33190 NODE_NAME_CASE(VPERMIL2)
33191 NODE_NAME_CASE(FMSUB)
33192 NODE_NAME_CASE(STRICT_FMSUB)
33193 NODE_NAME_CASE(FNMADD)
33194 NODE_NAME_CASE(STRICT_FNMADD)
33195 NODE_NAME_CASE(FNMSUB)
33196 NODE_NAME_CASE(STRICT_FNMSUB)
33197 NODE_NAME_CASE(FMADDSUB)
33198 NODE_NAME_CASE(FMSUBADD)
33199 NODE_NAME_CASE(FMADD_RND)
33200 NODE_NAME_CASE(FNMADD_RND)
33201 NODE_NAME_CASE(FMSUB_RND)
33202 NODE_NAME_CASE(FNMSUB_RND)
33203 NODE_NAME_CASE(FMADDSUB_RND)
33204 NODE_NAME_CASE(FMSUBADD_RND)
33205 NODE_NAME_CASE(VFMADDC)
33206 NODE_NAME_CASE(VFMADDC_RND)
33207 NODE_NAME_CASE(VFCMADDC)
33208 NODE_NAME_CASE(VFCMADDC_RND)
33209 NODE_NAME_CASE(VFMULC)
33210 NODE_NAME_CASE(VFMULC_RND)
33211 NODE_NAME_CASE(VFCMULC)
33212 NODE_NAME_CASE(VFCMULC_RND)
33213 NODE_NAME_CASE(VFMULCSH)
33214 NODE_NAME_CASE(VFMULCSH_RND)
33215 NODE_NAME_CASE(VFCMULCSH)
33216 NODE_NAME_CASE(VFCMULCSH_RND)
33217 NODE_NAME_CASE(VFMADDCSH)
33218 NODE_NAME_CASE(VFMADDCSH_RND)
33219 NODE_NAME_CASE(VFCMADDCSH)
33220 NODE_NAME_CASE(VFCMADDCSH_RND)
33221 NODE_NAME_CASE(VPMADD52H)
33222 NODE_NAME_CASE(VPMADD52L)
33223 NODE_NAME_CASE(VRNDSCALE)
33224 NODE_NAME_CASE(STRICT_VRNDSCALE)
33225 NODE_NAME_CASE(VRNDSCALE_SAE)
33226 NODE_NAME_CASE(VRNDSCALES)
33227 NODE_NAME_CASE(VRNDSCALES_SAE)
33228 NODE_NAME_CASE(VREDUCE)
33229 NODE_NAME_CASE(VREDUCE_SAE)
33230 NODE_NAME_CASE(VREDUCES)
33231 NODE_NAME_CASE(VREDUCES_SAE)
33232 NODE_NAME_CASE(VGETMANT)
33233 NODE_NAME_CASE(VGETMANT_SAE)
33234 NODE_NAME_CASE(VGETMANTS)
33235 NODE_NAME_CASE(VGETMANTS_SAE)
33236 NODE_NAME_CASE(PCMPESTR)
33237 NODE_NAME_CASE(PCMPISTR)
33238 NODE_NAME_CASE(XTEST)
33239 NODE_NAME_CASE(COMPRESS)
33240 NODE_NAME_CASE(EXPAND)
33241 NODE_NAME_CASE(SELECTS)
33242 NODE_NAME_CASE(ADDSUB)
33243 NODE_NAME_CASE(RCP14)
33244 NODE_NAME_CASE(RCP14S)
33245 NODE_NAME_CASE(RCP28)
33246 NODE_NAME_CASE(RCP28_SAE)
33247 NODE_NAME_CASE(RCP28S)
33248 NODE_NAME_CASE(RCP28S_SAE)
33249 NODE_NAME_CASE(EXP2)
33250 NODE_NAME_CASE(EXP2_SAE)
33251 NODE_NAME_CASE(RSQRT14)
33252 NODE_NAME_CASE(RSQRT14S)
33253 NODE_NAME_CASE(RSQRT28)
33254 NODE_NAME_CASE(RSQRT28_SAE)
33255 NODE_NAME_CASE(RSQRT28S)
33256 NODE_NAME_CASE(RSQRT28S_SAE)
33257 NODE_NAME_CASE(FADD_RND)
33258 NODE_NAME_CASE(FADDS)
33259 NODE_NAME_CASE(FADDS_RND)
33260 NODE_NAME_CASE(FSUB_RND)
33261 NODE_NAME_CASE(FSUBS)
33262 NODE_NAME_CASE(FSUBS_RND)
33263 NODE_NAME_CASE(FMUL_RND)
33264 NODE_NAME_CASE(FMULS)
33265 NODE_NAME_CASE(FMULS_RND)
33266 NODE_NAME_CASE(FDIV_RND)
33267 NODE_NAME_CASE(FDIVS)
33268 NODE_NAME_CASE(FDIVS_RND)
33269 NODE_NAME_CASE(FSQRT_RND)
33270 NODE_NAME_CASE(FSQRTS)
33271 NODE_NAME_CASE(FSQRTS_RND)
33272 NODE_NAME_CASE(FGETEXP)
33273 NODE_NAME_CASE(FGETEXP_SAE)
33274 NODE_NAME_CASE(FGETEXPS)
33275 NODE_NAME_CASE(FGETEXPS_SAE)
33276 NODE_NAME_CASE(SCALEF)
33277 NODE_NAME_CASE(SCALEF_RND)
33278 NODE_NAME_CASE(SCALEFS)
33279 NODE_NAME_CASE(SCALEFS_RND)
33280 NODE_NAME_CASE(MULHRS)
33281 NODE_NAME_CASE(SINT_TO_FP_RND)
33282 NODE_NAME_CASE(UINT_TO_FP_RND)
33283 NODE_NAME_CASE(CVTTP2SI)
33284 NODE_NAME_CASE(CVTTP2UI)
33285 NODE_NAME_CASE(STRICT_CVTTP2SI)
33286 NODE_NAME_CASE(STRICT_CVTTP2UI)
33287 NODE_NAME_CASE(MCVTTP2SI)
33288 NODE_NAME_CASE(MCVTTP2UI)
33289 NODE_NAME_CASE(CVTTP2SI_SAE)
33290 NODE_NAME_CASE(CVTTP2UI_SAE)
33291 NODE_NAME_CASE(CVTTS2SI)
33292 NODE_NAME_CASE(CVTTS2UI)
33293 NODE_NAME_CASE(CVTTS2SI_SAE)
33294 NODE_NAME_CASE(CVTTS2UI_SAE)
33295 NODE_NAME_CASE(CVTSI2P)
33296 NODE_NAME_CASE(CVTUI2P)
33297 NODE_NAME_CASE(STRICT_CVTSI2P)
33298 NODE_NAME_CASE(STRICT_CVTUI2P)
33299 NODE_NAME_CASE(MCVTSI2P)
33300 NODE_NAME_CASE(MCVTUI2P)
33301 NODE_NAME_CASE(VFPCLASS)
33302 NODE_NAME_CASE(VFPCLASSS)
33303 NODE_NAME_CASE(MULTISHIFT)
33304 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33305 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33306 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33307 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33308 NODE_NAME_CASE(CVTPS2PH)
33309 NODE_NAME_CASE(STRICT_CVTPS2PH)
33310 NODE_NAME_CASE(MCVTPS2PH)
33311 NODE_NAME_CASE(CVTPH2PS)
33312 NODE_NAME_CASE(STRICT_CVTPH2PS)
33313 NODE_NAME_CASE(CVTPH2PS_SAE)
33314 NODE_NAME_CASE(CVTP2SI)
33315 NODE_NAME_CASE(CVTP2UI)
33316 NODE_NAME_CASE(MCVTP2SI)
33317 NODE_NAME_CASE(MCVTP2UI)
33318 NODE_NAME_CASE(CVTP2SI_RND)
33319 NODE_NAME_CASE(CVTP2UI_RND)
33320 NODE_NAME_CASE(CVTS2SI)
33321 NODE_NAME_CASE(CVTS2UI)
33322 NODE_NAME_CASE(CVTS2SI_RND)
33323 NODE_NAME_CASE(CVTS2UI_RND)
33324 NODE_NAME_CASE(CVTNE2PS2BF16)
33325 NODE_NAME_CASE(CVTNEPS2BF16)
33326 NODE_NAME_CASE(MCVTNEPS2BF16)
33327 NODE_NAME_CASE(DPBF16PS)
33328 NODE_NAME_CASE(LWPINS)
33329 NODE_NAME_CASE(MGATHER)
33330 NODE_NAME_CASE(MSCATTER)
33331 NODE_NAME_CASE(VPDPBUSD)
33332 NODE_NAME_CASE(VPDPBUSDS)
33333 NODE_NAME_CASE(VPDPWSSD)
33334 NODE_NAME_CASE(VPDPWSSDS)
33335 NODE_NAME_CASE(VPSHUFBITQMB)
33336 NODE_NAME_CASE(GF2P8MULB)
33337 NODE_NAME_CASE(GF2P8AFFINEQB)
33338 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33339 NODE_NAME_CASE(NT_CALL)
33340 NODE_NAME_CASE(NT_BRIND)
33341 NODE_NAME_CASE(UMWAIT)
33342 NODE_NAME_CASE(TPAUSE)
33343 NODE_NAME_CASE(ENQCMD)
33344 NODE_NAME_CASE(ENQCMDS)
33345 NODE_NAME_CASE(VP2INTERSECT)
33346 NODE_NAME_CASE(AESENC128KL)
33347 NODE_NAME_CASE(AESDEC128KL)
33348 NODE_NAME_CASE(AESENC256KL)
33349 NODE_NAME_CASE(AESDEC256KL)
33350 NODE_NAME_CASE(AESENCWIDE128KL)
33351 NODE_NAME_CASE(AESDECWIDE128KL)
33352 NODE_NAME_CASE(AESENCWIDE256KL)
33353 NODE_NAME_CASE(AESDECWIDE256KL)
33354 NODE_NAME_CASE(TESTUI)
33355 }
33356 return nullptr;
33357#undef NODE_NAME_CASE
33358}
33359
33360/// Return true if the addressing mode represented by AM is legal for this
33361/// target, for a load/store of the specified type.
33362bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33363 const AddrMode &AM, Type *Ty,
33364 unsigned AS,
33365 Instruction *I) const {
33366 // X86 supports extremely general addressing modes.
33367 CodeModel::Model M = getTargetMachine().getCodeModel();
33368
33369 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33370 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33371 return false;
33372
33373 if (AM.BaseGV) {
33374 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33375
33376 // If a reference to this global requires an extra load, we can't fold it.
33377 if (isGlobalStubReference(GVFlags))
33378 return false;
33379
33380 // If BaseGV requires a register for the PIC base, we cannot also have a
33381 // BaseReg specified.
33382 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33383 return false;
33384
33385 // If lower 4G is not available, then we must use rip-relative addressing.
33386 if ((M != CodeModel::Small || isPositionIndependent()) &&
33387 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33388 return false;
33389 }
33390
33391 switch (AM.Scale) {
33392 case 0:
33393 case 1:
33394 case 2:
33395 case 4:
33396 case 8:
33397 // These scales always work.
33398 break;
33399 case 3:
33400 case 5:
33401 case 9:
33402 // These scales are formed with basereg+scalereg. Only accept if there is
33403 // no basereg yet.
33404 if (AM.HasBaseReg)
33405 return false;
33406 break;
33407 default: // Other stuff never works.
33408 return false;
33409 }
33410
33411 return true;
33412}
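// The scale rule above mirrors what an x86 SIB byte can encode: base +
// index*scale + disp with scale in {1,2,4,8}, while 3, 5 and 9 are only
// reachable as base + index*scale (e.g. lea (%rax,%rax,2), %rcx) and so
// consume the base-register slot. A minimal standalone sketch of just that
// rule; isLegalX86Scale is an illustrative helper, not part of this file:

static bool isLegalX86Scale(int Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;            // directly encodable in the SIB byte
  case 3: case 5: case 9:
    return !HasBaseReg;     // needs the base register to hold the extra index
  default:
    return false;
  }
}

// e.g. isLegalX86Scale(3, /*HasBaseReg=*/false) == true  (index*2 + index)
//      isLegalX86Scale(3, /*HasBaseReg=*/true)  == false (base slot in use)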
33413
33414bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33415 unsigned Bits = Ty->getScalarSizeInBits();
33416
33417 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33418 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33419 if (Subtarget.hasXOP() &&
33420 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33421 return false;
33422
33423 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33424 // shifts just as cheap as scalar ones.
33425 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33426 return false;
33427
33428 // AVX512BW has shifts such as vpsllvw.
33429 if (Subtarget.hasBWI() && Bits == 16)
33430 return false;
33431
33432 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33433 // fully general vector.
33434 return true;
33435}
33436
33437bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33438 switch (Opcode) {
33439 // These are non-commutative binops.
33440 // TODO: Add more X86ISD opcodes once we have test coverage.
33441 case X86ISD::ANDNP:
33442 case X86ISD::PCMPGT:
33443 case X86ISD::FMAX:
33444 case X86ISD::FMIN:
33445 case X86ISD::FANDN:
33446 case X86ISD::VPSHA:
33447 case X86ISD::VPSHL:
33448 case X86ISD::VSHLV:
33449 case X86ISD::VSRLV:
33450 case X86ISD::VSRAV:
33451 return true;
33452 }
33453
33454 return TargetLoweringBase::isBinOp(Opcode);
33455}
33456
33457bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33458 switch (Opcode) {
33459 // TODO: Add more X86ISD opcodes once we have test coverage.
33460 case X86ISD::PCMPEQ:
33461 case X86ISD::PMULDQ:
33462 case X86ISD::PMULUDQ:
33463 case X86ISD::FMAXC:
33464 case X86ISD::FMINC:
33465 case X86ISD::FAND:
33466 case X86ISD::FOR:
33467 case X86ISD::FXOR:
33468 return true;
33469 }
33470
33471 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33472}
33473
33474bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33475 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33476 return false;
33477 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33478 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33479 return NumBits1 > NumBits2;
33480}
33481
33482bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
33483 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33484 return false;
33485
33486 if (!isTypeLegal(EVT::getEVT(Ty1)))
33487 return false;
33488
33489 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
33490
33491 // Assuming the caller doesn't have a zeroext or signext return parameter,
33492 // truncation all the way down to i1 is valid.
33493 return true;
33494}
33495
33496bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
33497 return isInt<32>(Imm);
33498}
33499
33500bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
33501 // Can also use sub to handle negated immediates.
33502 return isInt<32>(Imm);
33503}
33504
33505bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
33506 return isInt<32>(Imm);
33507}
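// All three hooks above accept exactly the immediates that fit x86's
// sign-extended 32-bit immediate field. A standalone sketch of that check;
// fitsInSExt32 is an illustrative stand-in for llvm::isInt<32>:

#include <cstdint>

static bool fitsInSExt32(int64_t Imm) {
  return Imm >= INT32_MIN && Imm <= INT32_MAX;
}

// e.g. fitsInSExt32(-1)           == true   (also usable negated via sub)
//      fitsInSExt32(0x7fffffff)   == true
//      fitsInSExt32(0x80000000LL) == false  (needs a register/movabs instead)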
33508
33509bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
33510 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
33511 return false;
33512 unsigned NumBits1 = VT1.getSizeInBits();
33513 unsigned NumBits2 = VT2.getSizeInBits();
33514 return NumBits1 > NumBits2;
33515}
33516
33517bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
33518 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33519 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
33520}
33521
33522bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
33523 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33524 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
33525}
33526
33527bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
33528 EVT VT1 = Val.getValueType();
33529 if (isZExtFree(VT1, VT2))
33530 return true;
33531
33532 if (Val.getOpcode() != ISD::LOAD)
33533 return false;
33534
33535 if (!VT1.isSimple() || !VT1.isInteger() ||
33536 !VT2.isSimple() || !VT2.isInteger())
33537 return false;
33538
33539 switch (VT1.getSimpleVT().SimpleTy) {
33540 default: break;
33541 case MVT::i8:
33542 case MVT::i16:
33543 case MVT::i32:
33544 // X86 has 8, 16, and 32-bit zero-extending loads.
33545 return true;
33546 }
33547
33548 return false;
33549}
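// For example, "movl (%rdi), %eax" already clears bits 63:32 of %rax, and
// i8/i16 loads are available as movzbl/movzwl, so the later zext to i64 that
// the switch above recognizes costs no extra instruction.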
33550
33551bool X86TargetLowering::shouldSinkOperands(Instruction *I,
33552 SmallVectorImpl<Use *> &Ops) const {
33553 using namespace llvm::PatternMatch;
33554
33555 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
33556 if (!VTy)
33557 return false;
33558
33559 if (I->getOpcode() == Instruction::Mul &&
33560 VTy->getElementType()->isIntegerTy(64)) {
33561 for (auto &Op : I->operands()) {
33562 // Make sure we are not already sinking this operand
33563 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
33564 continue;
33565
33566 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
33567 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
33568 if (Subtarget.hasSSE41() &&
33569 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
33570 m_SpecificInt(32)))) {
33571 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
33572 Ops.push_back(&Op);
33573 } else if (Subtarget.hasSSE2() &&
33574 match(Op.get(),
33575 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
33576 Ops.push_back(&Op);
33577 }
33578 }
33579
33580 return !Ops.empty();
33581 }
33582
33583 // A uniform shift amount in a vector shift or funnel shift may be much
33584 // cheaper than a generic variable vector shift, so make that pattern visible
33585 // to SDAG by sinking the shuffle instruction next to the shift.
33586 int ShiftAmountOpNum = -1;
33587 if (I->isShift())
33588 ShiftAmountOpNum = 1;
33589 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
33590 if (II->getIntrinsicID() == Intrinsic::fshl ||
33591 II->getIntrinsicID() == Intrinsic::fshr)
33592 ShiftAmountOpNum = 2;
33593 }
33594
33595 if (ShiftAmountOpNum == -1)
33596 return false;
33597
33598 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
33599 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
33600 isVectorShiftByScalarCheap(I->getType())) {
33601 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
33602 return true;
33603 }
33604
33605 return false;
33606}
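// The two operand shapes matched above are the scalar identities behind
// PMULDQ/PMULUDQ, which multiply only the low 32 bits of each 64-bit lane.
// A standalone sketch of those identities in plain C++ (illustrative only;
// the matcher above works on vector IR, not on scalars like this):

#include <cassert>
#include <cstdint>

static int64_t sextInReg32(uint64_t X) {
  // m_AShr(m_Shl(x, 32), 32): keep the low 32 bits, sign-extended (PMULDQ input).
  return (int64_t)(int32_t)(uint32_t)X;
}

static uint64_t zextInReg32(uint64_t X) {
  // m_And(x, 0xffffffff): keep the low 32 bits, zero-extended (PMULUDQ input).
  return X & UINT64_C(0xffffffff);
}

int main() {
  assert(sextInReg32(UINT64_C(0x00000000fffffffe)) == -2);
  assert(zextInReg32(UINT64_C(0xdeadbeef00000005)) == 5);
  return 0;
}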
33607
33608bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
33609 if (!Subtarget.is64Bit())
33610 return false;
33611 return TargetLowering::shouldConvertPhiType(From, To);
33612}
33613
33614bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
33615 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
33616 return false;
33617
33618 EVT SrcVT = ExtVal.getOperand(0).getValueType();
33619
33620 // There is no extending load for vXi1.
33621 if (SrcVT.getScalarType() == MVT::i1)
33622 return false;
33623
33624 return true;
33625}
33626
33627bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
33628 EVT VT) const {
33629 if (!Subtarget.hasAnyFMA())
33630 return false;
33631
33632 VT = VT.getScalarType();
33633
33634 if (!VT.isSimple())
33635 return false;
33636
33637 switch (VT.getSimpleVT().SimpleTy) {
33638 case MVT::f16:
33639 return Subtarget.hasFP16();
33640 case MVT::f32:
33641 case MVT::f64:
33642 return true;
33643 default:
33644 break;
33645 }
33646
33647 return false;
33648}
33649
33650bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
33651 // i16 instructions are longer (0x66 prefix) and potentially slower.
33652 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
33653}
33654
33655bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
33656 EVT VT) const {
33657 // TODO: This is too general. There are cases where pre-AVX512 codegen would
33658 // benefit. The transform may also be profitable for scalar code.
33659 if (!Subtarget.hasAVX512())
33660 return false;
33661 if (!Subtarget.hasVLX() && !VT.is512BitVector())
33662 return false;
33663 if (!VT.isVector())
33664 return false;
33665
33666 return true;
33667}
33668
33669/// Targets can use this to indicate that they only support *some*
33670/// VECTOR_SHUFFLE operations, those with specific masks.
33671/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
33672/// are assumed to be legal.
33673bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
33674 if (!VT.isSimple())
33675 return false;
33676
33677 // Not for i1 vectors
33678 if (VT.getSimpleVT().getScalarType() == MVT::i1)
33679 return false;
33680
33681 // Very little shuffling can be done for 64-bit vectors right now.
33682 if (VT.getSimpleVT().getSizeInBits() == 64)
33683 return false;
33684
33685 // We only care that the types being shuffled are legal. The lowering can
33686 // handle any possible shuffle mask that results.
33687 return isTypeLegal(VT.getSimpleVT());
33688}
33689
33690bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
33691 EVT VT) const {
33692 // Don't convert an 'and' into a shuffle that we don't directly support.
33693 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
33694 if (!Subtarget.hasAVX2())
33695 if (VT == MVT::v32i8 || VT == MVT::v16i16)
33696 return false;
33697
33698 // Just delegate to the generic legality, clear masks aren't special.
33699 return isShuffleMaskLegal(Mask, VT);
33700}
33701
33702bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
33703 // If the subtarget is using thunks, we need to not generate jump tables.
33704 if (Subtarget.useIndirectThunkBranches())
33705 return false;
33706
33707 // Otherwise, fallback on the generic logic.
33708 return TargetLowering::areJTsAllowed(Fn);
33709}
33710
33711//===----------------------------------------------------------------------===//
33712// X86 Scheduler Hooks
33713//===----------------------------------------------------------------------===//
33714
33715// Returns true if EFLAG is consumed after this iterator in the rest of the
33716// basic block or any successors of the basic block.
33717static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
33718 MachineBasicBlock *BB) {
33719 // Scan forward through BB for a use/def of EFLAGS.
33720 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
33721 if (mi.readsRegister(X86::EFLAGS))
33722 return true;
33723 // If we found a def, we can stop searching.
33724 if (mi.definesRegister(X86::EFLAGS))
33725 return false;
33726 }
33727
33728 // If we hit the end of the block, check whether EFLAGS is live into a
33729 // successor.
33730 for (MachineBasicBlock *Succ : BB->successors())
33731 if (Succ->isLiveIn(X86::EFLAGS))
33732 return true;
33733
33734 return false;
33735}
33736
33737/// Utility function to emit xbegin specifying the start of an RTM region.
33738static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
33739 const TargetInstrInfo *TII) {
33740 const DebugLoc &DL = MI.getDebugLoc();
33741
33742 const BasicBlock *BB = MBB->getBasicBlock();
33743 MachineFunction::iterator I = ++MBB->getIterator();
33744
33745 // For the v = xbegin(), we generate
33746 //
33747 // thisMBB:
33748 // xbegin sinkMBB
33749 //
33750 // mainMBB:
33751 // s0 = -1
33752 //
33753 // fallBB:
33754 // eax = # XABORT_DEF
33755 // s1 = eax
33756 //
33757 // sinkMBB:
33758 // v = phi(s0/mainBB, s1/fallBB)
33759
33760 MachineBasicBlock *thisMBB = MBB;
33761 MachineFunction *MF = MBB->getParent();
33762 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33763 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33764 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33765 MF->insert(I, mainMBB);
33766 MF->insert(I, fallMBB);
33767 MF->insert(I, sinkMBB);
33768
33769 if (isEFLAGSLiveAfter(MI, MBB)) {
33770 mainMBB->addLiveIn(X86::EFLAGS);
33771 fallMBB->addLiveIn(X86::EFLAGS);
33772 sinkMBB->addLiveIn(X86::EFLAGS);
33773 }
33774
33775 // Transfer the remainder of BB and its successor edges to sinkMBB.
33776 sinkMBB->splice(sinkMBB->begin(), MBB,
33777 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33778 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33779
33780 MachineRegisterInfo &MRI = MF->getRegInfo();
33781 Register DstReg = MI.getOperand(0).getReg();
33782 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33783 Register mainDstReg = MRI.createVirtualRegister(RC);
33784 Register fallDstReg = MRI.createVirtualRegister(RC);
33785
33786 // thisMBB:
33787 // xbegin fallMBB
33788 // # fallthrough to mainMBB
33789 // # abortion to fallMBB
33790 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
33791 thisMBB->addSuccessor(mainMBB);
33792 thisMBB->addSuccessor(fallMBB);
33793
33794 // mainMBB:
33795 // mainDstReg := -1
33796 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
33797 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33798 mainMBB->addSuccessor(sinkMBB);
33799
33800 // fallMBB:
33801 // ; pseudo instruction to model hardware's definition from XABORT
33802 // EAX := XABORT_DEF
33803 // fallDstReg := EAX
33804 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
33805 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
33806 .addReg(X86::EAX);
33807 fallMBB->addSuccessor(sinkMBB);
33808
33809 // sinkMBB:
33810 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
33811 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
33812 .addReg(mainDstReg).addMBB(mainMBB)
33813 .addReg(fallDstReg).addMBB(fallMBB);
33814
33815 MI.eraseFromParent();
33816 return sinkMBB;
33817}
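// At the source level, the pseudo expanded above corresponds to the RTM
// intrinsics from <immintrin.h>; roughly (assuming a target compiled with
// -mrtm, shown only to illustrate where the two PHI inputs come from):
//
//   unsigned Status = _xbegin();      // XBEGIN_4: fall through on start,
//                                     // branch to the abort path otherwise
//   if (Status == _XBEGIN_STARTED) {  // mainMBB input: -1 (all ones)
//     /* transactional region */
//     _xend();
//   } else {
//     /* fallMBB input: the abort status the hardware left in EAX */
//   }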
33818
33819MachineBasicBlock *
33820X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
33821 MachineBasicBlock *MBB) const {
33822 // Emit va_arg instruction on X86-64.
33823
33824 // Operands to this pseudo-instruction:
33825 // 0 ) Output : destination address (reg)
33826 // 1-5) Input : va_list address (addr, i64mem)
33827 // 6 ) ArgSize : Size (in bytes) of vararg type
33828 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
33829 // 8 ) Align : Alignment of type
33830 // 9 ) EFLAGS (implicit-def)
33831
33832 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
33833 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
33834
33835 Register DestReg = MI.getOperand(0).getReg();
33836 MachineOperand &Base = MI.getOperand(1);
33837 MachineOperand &Scale = MI.getOperand(2);
33838 MachineOperand &Index = MI.getOperand(3);
33839 MachineOperand &Disp = MI.getOperand(4);
33840 MachineOperand &Segment = MI.getOperand(5);
33841 unsigned ArgSize = MI.getOperand(6).getImm();
33842 unsigned ArgMode = MI.getOperand(7).getImm();
33843 Align Alignment = Align(MI.getOperand(8).getImm());
33844
33845 MachineFunction *MF = MBB->getParent();
33846
33847 // Memory Reference
33848 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
33849
33850 MachineMemOperand *OldMMO = MI.memoperands().front();
33851
33852 // Clone the MMO into two separate MMOs for loading and storing
33853 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
33854 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
33855 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
33856 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
33857
33858 // Machine Information
33859 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33860 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
33861 const TargetRegisterClass *AddrRegClass =
33862 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
33863 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
33864 const DebugLoc &DL = MI.getDebugLoc();
33865
33866 // struct va_list {
33867 // i32 gp_offset
33868 // i32 fp_offset
33869 // i64 overflow_area (address)
33870 // i64 reg_save_area (address)
33871 // }
33872 // sizeof(va_list) = 24
33873 // alignment(va_list) = 8
33874
33875 unsigned TotalNumIntRegs = 6;
33876 unsigned TotalNumXMMRegs = 8;
33877 bool UseGPOffset = (ArgMode == 1);
33878 bool UseFPOffset = (ArgMode == 2);
33879 unsigned MaxOffset = TotalNumIntRegs * 8 +
33880 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
33881
33882 /* Align ArgSize to a multiple of 8 */
33883 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
33884 bool NeedsAlign = (Alignment > 8);
33885
33886 MachineBasicBlock *thisMBB = MBB;
33887 MachineBasicBlock *overflowMBB;
33888 MachineBasicBlock *offsetMBB;
33889 MachineBasicBlock *endMBB;
33890
33891 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
33892 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
33893 unsigned OffsetReg = 0;
33894
33895 if (!UseGPOffset && !UseFPOffset) {
33896 // If we only pull from the overflow region, we don't create a branch.
33897 // We don't need to alter control flow.
33898 OffsetDestReg = 0; // unused
33899 OverflowDestReg = DestReg;
33900
33901 offsetMBB = nullptr;
33902 overflowMBB = thisMBB;
33903 endMBB = thisMBB;
33904 } else {
33905 // First emit code to check if gp_offset (or fp_offset) is below the bound.
33906 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33907 // If not, pull from overflow_area. (branch to overflowMBB)
33908 //
33909 // thisMBB
33910 // | .
33911 // | .
33912 // offsetMBB overflowMBB
33913 // | .
33914 // | .
33915 // endMBB
33916
33917 // Registers for the PHI in endMBB
33918 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33919 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33920
33921 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33922 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33923 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33924 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33925
33926 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33927
33928 // Insert the new basic blocks
33929 MF->insert(MBBIter, offsetMBB);
33930 MF->insert(MBBIter, overflowMBB);
33931 MF->insert(MBBIter, endMBB);
33932
33933 // Transfer the remainder of MBB and its successor edges to endMBB.
33934 endMBB->splice(endMBB->begin(), thisMBB,
33935 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
33936 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
33937
33938 // Make offsetMBB and overflowMBB successors of thisMBB
33939 thisMBB->addSuccessor(offsetMBB);
33940 thisMBB->addSuccessor(overflowMBB);
33941
33942 // endMBB is a successor of both offsetMBB and overflowMBB
33943 offsetMBB->addSuccessor(endMBB);
33944 overflowMBB->addSuccessor(endMBB);
33945
33946 // Load the offset value into a register
33947 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33948 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
33949 .add(Base)
33950 .add(Scale)
33951 .add(Index)
33952 .addDisp(Disp, UseFPOffset ? 4 : 0)
33953 .add(Segment)
33954 .setMemRefs(LoadOnlyMMO);
33955
33956 // Check if there is enough room left to pull this argument.
33957 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
33958 .addReg(OffsetReg)
33959 .addImm(MaxOffset + 8 - ArgSizeA8);
33960
33961 // Branch to "overflowMBB" if offset >= max
33962 // Fall through to "offsetMBB" otherwise
33963 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
33964 .addMBB(overflowMBB).addImm(X86::COND_AE);
33965 }
33966
33967 // In offsetMBB, emit code to use the reg_save_area.
33968 if (offsetMBB) {
33969 assert(OffsetReg != 0);
33970
33971 // Read the reg_save_area address.
33972 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
33973 BuildMI(
33974 offsetMBB, DL,
33975 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33976 RegSaveReg)
33977 .add(Base)
33978 .add(Scale)
33979 .add(Index)
33980 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
33981 .add(Segment)
33982 .setMemRefs(LoadOnlyMMO);
33983
33984 if (Subtarget.isTarget64BitLP64()) {
33985 // Zero-extend the offset
33986 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
33987 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
33988 .addImm(0)
33989 .addReg(OffsetReg)
33990 .addImm(X86::sub_32bit);
33991
33992 // Add the offset to the reg_save_area to get the final address.
33993 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
33994 .addReg(OffsetReg64)
33995 .addReg(RegSaveReg);
33996 } else {
33997 // Add the offset to the reg_save_area to get the final address.
33998 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
33999 .addReg(OffsetReg)
34000 .addReg(RegSaveReg);
34001 }
34002
34003 // Compute the offset for the next argument
34004 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34005 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
34006 .addReg(OffsetReg)
34007 .addImm(UseFPOffset ? 16 : 8);
34008
34009 // Store it back into the va_list.
34010 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
34011 .add(Base)
34012 .add(Scale)
34013 .add(Index)
34014 .addDisp(Disp, UseFPOffset ? 4 : 0)
34015 .add(Segment)
34016 .addReg(NextOffsetReg)
34017 .setMemRefs(StoreOnlyMMO);
34018
34019 // Jump to endMBB
34020 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
34021 .addMBB(endMBB);
34022 }
34023
34024 //
34025 // Emit code to use overflow area
34026 //
34027
34028 // Load the overflow_area address into a register.
34029 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34030 BuildMI(overflowMBB, DL,
34031 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34032 OverflowAddrReg)
34033 .add(Base)
34034 .add(Scale)
34035 .add(Index)
34036 .addDisp(Disp, 8)
34037 .add(Segment)
34038 .setMemRefs(LoadOnlyMMO);
34039
34040 // If we need to align it, do so. Otherwise, just copy the address
34041 // to OverflowDestReg.
34042 if (NeedsAlign) {
34043 // Align the overflow address
34044 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34045
34046 // aligned_addr = (addr + (align-1)) & ~(align-1)
34047 BuildMI(
34048 overflowMBB, DL,
34049 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34050 TmpReg)
34051 .addReg(OverflowAddrReg)
34052 .addImm(Alignment.value() - 1);
34053
34054 BuildMI(
34055 overflowMBB, DL,
34056 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34057 OverflowDestReg)
34058 .addReg(TmpReg)
34059 .addImm(~(uint64_t)(Alignment.value() - 1));
34060 } else {
34061 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
34062 .addReg(OverflowAddrReg);
34063 }
34064
34065 // Compute the next overflow address after this argument.
34066 // (the overflow address should be kept 8-byte aligned)
34067 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34068 BuildMI(
34069 overflowMBB, DL,
34070 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34071 NextAddrReg)
34072 .addReg(OverflowDestReg)
34073 .addImm(ArgSizeA8);
34074
34075 // Store the new overflow address.
34076 BuildMI(overflowMBB, DL,
34077 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34078 .add(Base)
34079 .add(Scale)
34080 .add(Index)
34081 .addDisp(Disp, 8)
34082 .add(Segment)
34083 .addReg(NextAddrReg)
34084 .setMemRefs(StoreOnlyMMO);
34085
34086 // If we branched, emit the PHI to the front of endMBB.
34087 if (offsetMBB) {
34088 BuildMI(*endMBB, endMBB->begin(), DL,
34089 TII->get(X86::PHI), DestReg)
34090 .addReg(OffsetDestReg).addMBB(offsetMBB)
34091 .addReg(OverflowDestReg).addMBB(overflowMBB);
34092 }
34093
34094 // Erase the pseudo instruction
34095 MI.eraseFromParent();
34096
34097 return endMBB;
34098}
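// The control flow built above is the usual SysV x86-64 va_arg algorithm:
// pull from the register save area while gp_offset/fp_offset is still in
// range, otherwise fall back to the 8-byte-aligned overflow area. A
// standalone model of the ArgMode == 1 (gp_offset) case; VAListModel and
// takeGPArg are illustrative names, not the real va_list or this file's API:

#include <cstdint>

struct VAListModel {           // mirrors the 24-byte layout described above
  uint32_t gp_offset;          // 0..48, six 8-byte integer register slots
  uint32_t fp_offset;          // 48..176, eight 16-byte XMM register slots
  uint64_t overflow_arg_area;  // address, kept 8-byte aligned
  uint64_t reg_save_area;      // address of the spilled argument registers
};

static uint64_t takeGPArg(VAListModel &VL, unsigned ArgSize) {
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7u;   // same rounding as ArgSizeA8 above
  if (VL.gp_offset + ArgSizeA8 <= 6 * 8) {    // offsetMBB path
    uint64_t Addr = VL.reg_save_area + VL.gp_offset;
    VL.gp_offset += 8;                        // NextOffsetReg = OffsetReg + 8
    return Addr;
  }
  uint64_t Addr = VL.overflow_arg_area;       // overflowMBB path (alignment of
  VL.overflow_arg_area += ArgSizeA8;          // over-aligned types omitted here)
  return Addr;
}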
34099
34100// The EFLAGS operand of SelectItr might be missing a kill marker
34101// because there were multiple uses of EFLAGS, and ISel didn't know
34102// which to mark. Figure out whether SelectItr should have had a
34103// kill marker, and set it if it should. Returns the correct kill
34104// marker value.
34105static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34106 MachineBasicBlock* BB,
34107 const TargetRegisterInfo* TRI) {
34108 if (isEFLAGSLiveAfter(SelectItr, BB))
34109 return false;
34110
34111 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34112 // out. SelectMI should have a kill flag on EFLAGS.
34113 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34114 return true;
34115}
34116
34117// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34118// together with other CMOV pseudo-opcodes into a single basic-block with
34119// conditional jump around it.
34120static bool isCMOVPseudo(MachineInstr &MI) {
34121 switch (MI.getOpcode()) {
34122 case X86::CMOV_FR16X:
34123 case X86::CMOV_FR32:
34124 case X86::CMOV_FR32X:
34125 case X86::CMOV_FR64:
34126 case X86::CMOV_FR64X:
34127 case X86::CMOV_GR8:
34128 case X86::CMOV_GR16:
34129 case X86::CMOV_GR32:
34130 case X86::CMOV_RFP32:
34131 case X86::CMOV_RFP64:
34132 case X86::CMOV_RFP80:
34133 case X86::CMOV_VR64:
34134 case X86::CMOV_VR128:
34135 case X86::CMOV_VR128X:
34136 case X86::CMOV_VR256:
34137 case X86::CMOV_VR256X:
34138 case X86::CMOV_VR512:
34139 case X86::CMOV_VK1:
34140 case X86::CMOV_VK2:
34141 case X86::CMOV_VK4:
34142 case X86::CMOV_VK8:
34143 case X86::CMOV_VK16:
34144 case X86::CMOV_VK32:
34145 case X86::CMOV_VK64:
34146 return true;
34147
34148 default:
34149 return false;
34150 }
34151}
34152
34153// Helper function, which inserts PHI functions into SinkMBB:
34154// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34155// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
34156// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34157// the last PHI function inserted.
34158static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34159 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34160 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34161 MachineBasicBlock *SinkMBB) {
34162 MachineFunction *MF = TrueMBB->getParent();
34163 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34164 const DebugLoc &DL = MIItBegin->getDebugLoc();
34165
34166 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34167 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34168
34169 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34170
34171 // As we are creating the PHIs, we have to be careful if there is more than
34172 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34173 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34174 // That also means that PHI construction must work forward from earlier to
34175 // later, and that the code must maintain a mapping from earlier PHI's
34176 // destination registers, and the registers that went into the PHI.
34177 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34178 MachineInstrBuilder MIB;
34179
34180 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34181 Register DestReg = MIIt->getOperand(0).getReg();
34182 Register Op1Reg = MIIt->getOperand(1).getReg();
34183 Register Op2Reg = MIIt->getOperand(2).getReg();
34184
34185 // If this CMOV we are generating is the opposite condition from
34186 // the jump we generated, then we have to swap the operands for the
34187 // PHI that is going to be generated.
34188 if (MIIt->getOperand(3).getImm() == OppCC)
34189 std::swap(Op1Reg, Op2Reg);
34190
34191 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
34192 Op1Reg = RegRewriteTable[Op1Reg].first;
34193
34194 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
34195 Op2Reg = RegRewriteTable[Op2Reg].second;
34196
34197 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
34198 .addReg(Op1Reg)
34199 .addMBB(FalseMBB)
34200 .addReg(Op2Reg)
34201 .addMBB(TrueMBB);
34202
34203 // Add this PHI to the rewrite table.
34204 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34205 }
34206
34207 return MIB;
34208}
34209
34210// Lower cascaded selects in the form of (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
34211MachineBasicBlock *
34212X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34213 MachineInstr &SecondCascadedCMOV,
34214 MachineBasicBlock *ThisMBB) const {
34215 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34216 const DebugLoc &DL = FirstCMOV.getDebugLoc();
34217
34218 // We lower cascaded CMOVs such as
34219 //
34220 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34221 //
34222 // to two successive branches.
34223 //
34224 // Without this, we would add a PHI between the two jumps, which ends up
34225 // creating a few copies all around. For instance, for
34226 //
34227 // (sitofp (zext (fcmp une)))
34228 //
34229 // we would generate:
34230 //
34231 // ucomiss %xmm1, %xmm0
34232 // movss <1.0f>, %xmm0
34233 // movaps %xmm0, %xmm1
34234 // jne .LBB5_2
34235 // xorps %xmm1, %xmm1
34236 // .LBB5_2:
34237 // jp .LBB5_4
34238 // movaps %xmm1, %xmm0
34239 // .LBB5_4:
34240 // retq
34241 //
34242 // because this custom-inserter would have generated:
34243 //
34244 // A
34245 // | \
34246 // | B
34247 // | /
34248 // C
34249 // | \
34250 // | D
34251 // | /
34252 // E
34253 //
34254 // A: X = ...; Y = ...
34255 // B: empty
34256 // C: Z = PHI [X, A], [Y, B]
34257 // D: empty
34258 // E: PHI [X, C], [Z, D]
34259 //
34260 // If we lower both CMOVs in a single step, we can instead generate:
34261 //
34262 // A
34263 // | \
34264 // | C
34265 // | /|
34266 // |/ |
34267 // | |
34268 // | D
34269 // | /
34270 // E
34271 //
34272 // A: X = ...; Y = ...
34273 // D: empty
34274 // E: PHI [X, A], [X, C], [Y, D]
34275 //
34276 // Which, in our sitofp/fcmp example, gives us something like:
34277 //
34278 // ucomiss %xmm1, %xmm0
34279 // movss <1.0f>, %xmm0
34280 // jne .LBB5_4
34281 // jp .LBB5_4
34282 // xorps %xmm0, %xmm0
34283 // .LBB5_4:
34284 // retq
34285 //
34286
34287 // We lower cascaded CMOV into two successive branches to the same block.
34288 // EFLAGS is used by both, so mark it as live in the second.
34289 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34290 MachineFunction *F = ThisMBB->getParent();
34291 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34292 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34293 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34294
34295 MachineFunction::iterator It = ++ThisMBB->getIterator();
34296 F->insert(It, FirstInsertedMBB);
34297 F->insert(It, SecondInsertedMBB);
34298 F->insert(It, SinkMBB);
34299
34300 // For a cascaded CMOV, we lower it to two successive branches to
34301 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34302 // the FirstInsertedMBB.
34303 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34304
34305 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34306 // live into the sink and copy blocks.
34307 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34308 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
34309 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34310 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34311 SinkMBB->addLiveIn(X86::EFLAGS);
34312 }
34313
34314 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34315 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34316 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34317 ThisMBB->end());
34318 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34319
34320 // Fallthrough block for ThisMBB.
34321 ThisMBB->addSuccessor(FirstInsertedMBB);
34322 // The true block target of the first branch is always SinkMBB.
34323 ThisMBB->addSuccessor(SinkMBB);
34324 // Fallthrough block for FirstInsertedMBB.
34325 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34326 // The true block for the branch of FirstInsertedMBB.
34327 FirstInsertedMBB->addSuccessor(SinkMBB);
34328 // This is fallthrough.
34329 SecondInsertedMBB->addSuccessor(SinkMBB);
34330
34331 // Create the conditional branch instructions.
34332 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34333 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34334
34335 X86::CondCode SecondCC =
34336 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34337 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
34338
34339 // SinkMBB:
34340 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34341 Register DestReg = FirstCMOV.getOperand(0).getReg();
34342 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34343 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34344 MachineInstrBuilder MIB =
34345 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
34346 .addReg(Op1Reg)
34347 .addMBB(SecondInsertedMBB)
34348 .addReg(Op2Reg)
34349 .addMBB(ThisMBB);
34350
34351 // The SecondInsertedMBB provides the same incoming value as the
34352 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
34353 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34354 // Copy the PHI result to the register defined by the second CMOV.
34355 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
34356 TII->get(TargetOpcode::COPY),
34357 SecondCascadedCMOV.getOperand(0).getReg())
34358 .addReg(FirstCMOV.getOperand(0).getReg());
34359
34360 // Now remove the CMOVs.
34361 FirstCMOV.eraseFromParent();
34362 SecondCascadedCMOV.eraseFromParent();
34363
34364 return SinkMBB;
34365}
34366
34367MachineBasicBlock *
34368X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34369 MachineBasicBlock *ThisMBB) const {
34370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34371 const DebugLoc &DL = MI.getDebugLoc();
34372
34373 // To "insert" a SELECT_CC instruction, we actually have to insert the
34374 // diamond control-flow pattern. The incoming instruction knows the
34375 // destination vreg to set, the condition code register to branch on, the
34376 // true/false values to select between and a branch opcode to use.
34377
34378 // ThisMBB:
34379 // ...
34380 // TrueVal = ...
34381 // cmpTY ccX, r1, r2
34382 // bCC copy1MBB
34383 // fallthrough --> FalseMBB
34384
34385 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34386 // as described above, by inserting a BB, and then making a PHI at the join
34387 // point to select the true and false operands of the CMOV in the PHI.
34388 //
34389 // The code also handles two different cases of multiple CMOV opcodes
34390 // in a row.
34391 //
34392 // Case 1:
34393 // In this case, there are multiple CMOVs in a row, all of which are based on
34394 // the same condition setting (or the exact opposite condition setting).
34395 // In this case we can lower all the CMOVs using a single inserted BB, and
34396 // then make a number of PHIs at the join point to model the CMOVs. The only
34397 // trickiness here, is that in a case like:
34398 //
34399 // t2 = CMOV cond1 t1, f1
34400 // t3 = CMOV cond1 t2, f2
34401 //
34402 // when rewriting this into PHIs, we have to perform some renaming on the
34403 // temps since you cannot have a PHI operand refer to a PHI result earlier
34404 // in the same block. The "simple" but wrong lowering would be:
34405 //
34406 // t2 = PHI t1(BB1), f1(BB2)
34407 // t3 = PHI t2(BB1), f2(BB2)
34408 //
34409 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34410 // renaming is to note that on the path through BB1, t2 is really just a
34411 // copy of t1, and do that renaming, properly generating:
34412 //
34413 // t2 = PHI t1(BB1), f1(BB2)
34414 // t3 = PHI t1(BB1), f2(BB2)
34415 //
34416 // Case 2:
34417 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34418 // function - EmitLoweredCascadedSelect.
34419
34420 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34421 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34422 MachineInstr *LastCMOV = &MI;
34423 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
34424
34425 // Check for case 1, where there are multiple CMOVs with the same condition
34426 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34427 // number of jumps the most.
34428
34429 if (isCMOVPseudo(MI)) {
34430 // See if we have a string of CMOVS with the same condition. Skip over
34431 // intervening debug insts.
34432 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34433 (NextMIIt->getOperand(3).getImm() == CC ||
34434 NextMIIt->getOperand(3).getImm() == OppCC)) {
34435 LastCMOV = &*NextMIIt;
34436 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34437 }
34438 }
34439
34440 // This checks for case 2, but only if we didn't already find
34441 // case 1, as indicated by LastCMOV == &MI.
34442 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34443 NextMIIt->getOpcode() == MI.getOpcode() &&
34444 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34445 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34446 NextMIIt->getOperand(1).isKill()) {
34447 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34448 }
34449
34450 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34451 MachineFunction *F = ThisMBB->getParent();
34452 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34453 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34454
34455 MachineFunction::iterator It = ++ThisMBB->getIterator();
34456 F->insert(It, FalseMBB);
34457 F->insert(It, SinkMBB);
34458
34459 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34460 // live into the sink and copy blocks.
34461 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34462 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
34463 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34464 FalseMBB->addLiveIn(X86::EFLAGS);
34465 SinkMBB->addLiveIn(X86::EFLAGS);
34466 }
34467
34468 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
34469 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
34470 MachineBasicBlock::iterator(LastCMOV));
34471 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
34472 if (MI.isDebugInstr())
34473 SinkMBB->push_back(MI.removeFromParent());
34474
34475 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34476 SinkMBB->splice(SinkMBB->end(), ThisMBB,
34477 std::next(MachineBasicBlock::iterator(LastCMOV)),
34478 ThisMBB->end());
34479 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34480
34481 // Fallthrough block for ThisMBB.
34482 ThisMBB->addSuccessor(FalseMBB);
34483 // The true block target of the first (or only) branch is always a SinkMBB.
34484 ThisMBB->addSuccessor(SinkMBB);
34485 // Fallthrough block for FalseMBB.
34486 FalseMBB->addSuccessor(SinkMBB);
34487
34488 // Create the conditional branch instruction.
34489 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
34490
34491 // SinkMBB:
34492 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
34493 // ...
34494 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
34495 MachineBasicBlock::iterator MIItEnd =
34496 std::next(MachineBasicBlock::iterator(LastCMOV));
34497 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
34498
34499 // Now remove the CMOV(s).
34500 ThisMBB->erase(MIItBegin, MIItEnd);
34501
34502 return SinkMBB;
34503}
34504
34505static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
34506 if (IsLP64) {
34507 if (isInt<8>(Imm))
34508 return X86::SUB64ri8;
34509 return X86::SUB64ri32;
34510 } else {
34511 if (isInt<8>(Imm))
34512 return X86::SUB32ri8;
34513 return X86::SUB32ri;
34514 }
34515}
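// For example, getSUBriOpcode(/*IsLP64=*/true, 8) yields SUB64ri8 (the short
// sign-extended 8-bit immediate encoding), while a typical 4096-byte probe
// step, as used below, needs the 32-bit form SUB64ri32; the choice only
// affects encoding size, not behavior.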
34516
34517MachineBasicBlock *
34518X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
34519 MachineBasicBlock *MBB) const {
34520 MachineFunction *MF = MBB->getParent();
34521 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34522 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
34523 const DebugLoc &DL = MI.getDebugLoc();
34524 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34525
34526 const unsigned ProbeSize = getStackProbeSize(*MF);
34527
34528 MachineRegisterInfo &MRI = MF->getRegInfo();
34529 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34530 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34531 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34532
34533 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34534 MF->insert(MBBIter, testMBB);
34535 MF->insert(MBBIter, blockMBB);
34536 MF->insert(MBBIter, tailMBB);
34537
34538 Register sizeVReg = MI.getOperand(1).getReg();
34539
34540 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
34541
34542 Register TmpStackPtr = MRI.createVirtualRegister(
34543 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34544 Register FinalStackPtr = MRI.createVirtualRegister(
34545 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34546
34547 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
34548 .addReg(physSPReg);
34549 {
34550 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
34551 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
34552 .addReg(TmpStackPtr)
34553 .addReg(sizeVReg);
34554 }
34555
34556 // test rsp size
34557
34558 BuildMI(testMBB, DL,
34559 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
34560 .addReg(FinalStackPtr)
34561 .addReg(physSPReg);
34562
34563 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
34564 .addMBB(tailMBB)
34565 .addImm(X86::COND_GE);
34566 testMBB->addSuccessor(blockMBB);
34567 testMBB->addSuccessor(tailMBB);
34568
34569 // Touch the block then extend it. This is done on the opposite side of
34570 // static probe where we allocate then touch, to avoid the need of probing the
34571 // tail of the static alloca. Possible scenarios are:
34572 //
34573 // + ---- <- ------------ <- ------------- <- ------------ +
34574 // | |
34575 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
34576 // | |
34577 // + <- ----------- <- ------------ <- ----------- <- ------------ +
34578 //
34579 // The property we want to enforce is to never have more than [page alloc] between two probes.
34580
34581 const unsigned XORMIOpc =
34582 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
34583 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
34584 .addImm(0);
34585
34586 BuildMI(blockMBB, DL,
34587 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
34588 .addReg(physSPReg)
34589 .addImm(ProbeSize);
34590
34591
34592 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
34593 blockMBB->addSuccessor(testMBB);
34594
34595 // Replace original instruction by the expected stack ptr
34596 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
34597 .addReg(FinalStackPtr);
34598
34599 tailMBB->splice(tailMBB->end(), MBB,
34600 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34601 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
34602 MBB->addSuccessor(testMBB);
34603
34604 // Delete the original pseudo instruction.
34605 MI.eraseFromParent();
34606
34607 // And we're done.
34608 return tailMBB;
34609}
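// The three blocks created above form a simple probe loop over the newly
// allocated region. A standalone model of that loop using plain integers in
// place of the stack pointer; probedAllocaModel and touchPage are
// illustrative names only:

#include <cstdint>

static void touchPage(uint64_t /*Addr*/) {} // stands in for "xor dword ptr [addr], 0"

static uint64_t probedAllocaModel(uint64_t SP, uint64_t Size, uint64_t ProbeSize) {
  uint64_t Final = SP - Size;   // FinalStackPtr = SP - sizeVReg
  while (Final < SP) {          // testMBB: CMP Final, SP; jge tailMBB
    touchPage(SP);              // blockMBB: touch first, then extend, so that
    SP -= ProbeSize;            // no two probes are more than a page apart
  }
  return Final;                 // tailMBB: the value the DYN_ALLOCA produces
}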
34610
34611MachineBasicBlock *
34612X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
34613 MachineBasicBlock *BB) const {
34614 MachineFunction *MF = BB->getParent();
34615 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34616 const DebugLoc &DL = MI.getDebugLoc();
34617 const BasicBlock *LLVM_BB = BB->getBasicBlock();
34618
34619 assert(MF->shouldSplitStack());
34620
34621 const bool Is64Bit = Subtarget.is64Bit();
34622 const bool IsLP64 = Subtarget.isTarget64BitLP64();
34623
34624 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
34625 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
34626
34627 // BB:
34628 // ... [Till the alloca]
34629 // If stacklet is not large enough, jump to mallocMBB
34630 //
34631 // bumpMBB:
34632 // Allocate by subtracting from RSP
34633 // Jump to continueMBB
34634 //
34635 // mallocMBB:
34636 // Allocate by call to runtime
34637 //
34638 // continueMBB:
34639 // ...
34640 // [rest of original BB]
34641 //
34642
34643 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34644 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34645 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34646
34647 MachineRegisterInfo &MRI = MF->getRegInfo();
34648 const TargetRegisterClass *AddrRegClass =
34649 getRegClassFor(getPointerTy(MF->getDataLayout()));
34650
34651 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34652 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34653 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
34654 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
34655 sizeVReg = MI.getOperand(1).getReg(),
34656 physSPReg =
34657 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
34658
34659 MachineFunction::iterator MBBIter = ++BB->getIterator();
34660
34661 MF->insert(MBBIter, bumpMBB);
34662 MF->insert(MBBIter, mallocMBB);
34663 MF->insert(MBBIter, continueMBB);
34664
34665 continueMBB->splice(continueMBB->begin(), BB,
34666 std::next(MachineBasicBlock::iterator(MI)), BB->end());
34667 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
34668
34669 // Add code to the main basic block to check if the stack limit has been hit,
34670 // and if so, jump to mallocMBB otherwise to bumpMBB.
34671 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
34672 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
34673 .addReg(tmpSPVReg).addReg(sizeVReg);
34674 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
34675 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
34676 .addReg(SPLimitVReg);
34677 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
34678
34679 // bumpMBB simply decreases the stack pointer, since we know the current
34680 // stacklet has enough space.
34681 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
34682 .addReg(SPLimitVReg);
34683 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
34684 .addReg(SPLimitVReg);
34685 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34686
34687 // Calls into a routine in libgcc to allocate more space from the heap.
34688 const uint32_t *RegMask =
34689 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
34690 if (IsLP64) {
34691 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
34692 .addReg(sizeVReg);
34693 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34694 .addExternalSymbol("__morestack_allocate_stack_space")
34695 .addRegMask(RegMask)
34696 .addReg(X86::RDI, RegState::Implicit)
34697 .addReg(X86::RAX, RegState::ImplicitDefine);
34698 } else if (Is64Bit) {
34699 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
34700 .addReg(sizeVReg);
34701 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34702 .addExternalSymbol("__morestack_allocate_stack_space")
34703 .addRegMask(RegMask)
34704 .addReg(X86::EDI, RegState::Implicit)
34705 .addReg(X86::EAX, RegState::ImplicitDefine);
34706 } else {
34707 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
34708 .addImm(12);
34709 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
34710 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
34711 .addExternalSymbol("__morestack_allocate_stack_space")
34712 .addRegMask(RegMask)
34713 .addReg(X86::EAX, RegState::ImplicitDefine);
34714 }
34715
34716 if (!Is64Bit)
34717 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
34718 .addImm(16);
34719
34720 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
34721 .addReg(IsLP64 ? X86::RAX : X86::EAX);
34722 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34723
34724 // Set up the CFG correctly.
34725 BB->addSuccessor(bumpMBB);
34726 BB->addSuccessor(mallocMBB);
34727 mallocMBB->addSuccessor(continueMBB);
34728 bumpMBB->addSuccessor(continueMBB);
34729
34730 // Take care of the PHI nodes.
34731 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
34732 MI.getOperand(0).getReg())
34733 .addReg(mallocPtrVReg)
34734 .addMBB(mallocMBB)
34735 .addReg(bumpSPPtrVReg)
34736 .addMBB(bumpMBB);
34737
34738 // Delete the original pseudo instruction.
34739 MI.eraseFromParent();
34740
34741 // And we're done.
34742 return continueMBB;
34743}
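// In pseudo-C, the split-stack check emitted above is roughly:
//
//   newSP = SP - size;                                   // SUB64rr / SUB32rr
//   if (*(tls_base + TlsOffset) > newSP)                 // CMP against stacklet limit
//     result = __morestack_allocate_stack_space(size);   // mallocMBB
//   else
//     result = SP = newSP;                               // bumpMBB
//
// where tls_base is %fs on 64-bit and %gs on 32-bit targets, and TlsOffset
// (0x70 for LP64, 0x40 for x32, 0x30 for 32-bit) is the split-stack limit
// slot maintained by the morestack runtime.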
34744
34745MachineBasicBlock *
34746X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
34747 MachineBasicBlock *BB) const {
34748 MachineFunction *MF = BB->getParent();
34749 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34750 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
34751 const DebugLoc &DL = MI.getDebugLoc();
34752
34753 assert(!isAsynchronousEHPersonality(
34754 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
34755 "SEH does not use catchret!");
34756
34757 // Only 32-bit EH needs to worry about manually restoring stack pointers.
34758 if (!Subtarget.is32Bit())
34759 return BB;
34760
34761 // C++ EH creates a new target block to hold the restore code, and wires up
34762 // the new block to the return destination with a normal JMP_4.
34763 MachineBasicBlock *RestoreMBB =
34764 MF->CreateMachineBasicBlock(BB->getBasicBlock());
34765 assert(BB->succ_size() == 1);
34766 MF->insert(std::next(BB->getIterator()), RestoreMBB);
34767 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
34768 BB->addSuccessor(RestoreMBB);
34769 MI.getOperand(0).setMBB(RestoreMBB);
34770
34771 // Marking this as an EH pad but not a funclet entry block causes PEI to
34772 // restore stack pointers in the block.
34773 RestoreMBB->setIsEHPad(true);
34774
34775 auto RestoreMBBI = RestoreMBB->begin();
34776 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
34777 return BB;
34778}
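// A minimal sketch of the 32-bit C++ EH shape built above: the CATCHRET is
// retargeted at a fresh block that is marked as an EH pad (but not a funclet
// entry), so PEI restores the stack pointer there before the real jump:
//
//   BB:          CATCHRET %RestoreMBB, ...   ; operand 0 rewritten above
//   RestoreMBB:  ; EH pad -> stack pointer restore inserted by PEI
//                JMP_4 %TargetMBB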
34779
34780MachineBasicBlock *
34781X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
34782 MachineBasicBlock *BB) const {
34783 // So, here we replace TLSADDR with the sequence:
34784 // adjust_stackdown -> TLSADDR -> adjust_stackup.
34785 // We need this because TLSADDR is lowered into calls
34786 // inside MC, therefore without the two markers shrink-wrapping
34787 // may push the prologue/epilogue past them.
34788 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34789 const DebugLoc &DL = MI.getDebugLoc();
34790 MachineFunction &MF = *BB->getParent();
34791
34792 // Emit CALLSEQ_START right before the instruction.
34793 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
34794 MachineInstrBuilder CallseqStart =
34795 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
34796 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
34797
34798 // Emit CALLSEQ_END right after the instruction.
34799 // We don't call erase from parent because we want to keep the
34800 // original instruction around.
34801 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
34802 MachineInstrBuilder CallseqEnd =
34803 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
34804 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
34805
34806 return BB;
34807}
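// A minimal sketch of the result: the TLSADDR pseudo is left in place and
// bracketed by call-frame markers, roughly
//
//   ADJCALLSTACKDOWN 0, 0, 0     ; CALLSEQ_START
//   TLSADDR ...                  ; becomes a real call during MC lowering
//   ADJCALLSTACKUP 0, 0          ; CALLSEQ_END
//
// which is what keeps shrink-wrapping from moving the prologue/epilogue past
// the implicit call.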
34808
34809MachineBasicBlock *
34810X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
34811 MachineBasicBlock *BB) const {
34812 // This is pretty easy. We're taking the value that we received from
34813 // our load from the relocation, sticking it in either RDI (x86-64)
34814 // or EAX and doing an indirect call. The return value will then
34815 // be in the normal return register.
34816 MachineFunction *F = BB->getParent();
34817 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34818 const DebugLoc &DL = MI.getDebugLoc();
34819
34820 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
34821 assert(MI.getOperand(3).isGlobal() && "This should be a global");
34822
34823 // Get a register mask for the lowered call.
34824 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
34825 // proper register mask.
34826 const uint32_t *RegMask =
34827 Subtarget.is64Bit() ?
34828 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
34829 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
34830 if (Subtarget.is64Bit()) {
34831 MachineInstrBuilder MIB =
34832 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
34833 .addReg(X86::RIP)
34834 .addImm(0)
34835 .addReg(0)
34836 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34837 MI.getOperand(3).getTargetFlags())
34838 .addReg(0);
34839 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
34840 addDirectMem(MIB, X86::RDI);
34841 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
34842 } else if (!isPositionIndependent()) {
34843 MachineInstrBuilder MIB =
34844 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34845 .addReg(0)
34846 .addImm(0)
34847 .addReg(0)
34848 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34849 MI.getOperand(3).getTargetFlags())
34850 .addReg(0);
34851 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34852 addDirectMem(MIB, X86::EAX);
34853 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34854 } else {
34855 MachineInstrBuilder MIB =
34856 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34857 .addReg(TII->getGlobalBaseReg(F))
34858 .addImm(0)
34859 .addReg(0)
34860 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34861 MI.getOperand(3).getTargetFlags())
34862 .addReg(0);
34863 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34864 addDirectMem(MIB, X86::EAX);
34865 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34866 }
34867
34868 MI.eraseFromParent(); // The pseudo instruction is gone now.
34869 return BB;
34870}
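// A minimal sketch of the 64-bit sequence emitted above (the assembly syntax
// and relocation spelling are assumptions; the operands mirror the
// MOV64rm/CALL64m built in the code):
//
//   movq  _var@TLVP(%rip), %rdi    ; load the TLV descriptor address
//   callq *(%rdi)                  ; indirect call through the descriptor
//                                  ; result comes back implicitly in %rax
//
// The 32-bit paths use %eax and either an absolute or a GlobalBaseReg-relative
// load instead.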
34871
34872static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
34873 switch (RPOpc) {
34874 case X86::INDIRECT_THUNK_CALL32:
34875 return X86::CALLpcrel32;
34876 case X86::INDIRECT_THUNK_CALL64:
34877 return X86::CALL64pcrel32;
34878 case X86::INDIRECT_THUNK_TCRETURN32:
34879 return X86::TCRETURNdi;
34880 case X86::INDIRECT_THUNK_TCRETURN64:
34881 return X86::TCRETURNdi64;
34882 }
34883 llvm_unreachable("not indirect thunk opcode");
34884}
34885
34886static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34887 unsigned Reg) {
34888 if (Subtarget.useRetpolineExternalThunk()) {
34889 // When using an external thunk for retpolines, we pick names that match the
34890 // names GCC happens to use as well. This helps simplify the implementation
34891 // of the thunks for kernels where they have no easy ability to create
34892 // aliases and are doing non-trivial configuration of the thunk's body. For
34893 // example, the Linux kernel will do boot-time hot patching of the thunk
34894 // bodies and cannot easily export aliases of these to loaded modules.
34895 //
34896 // Note that at any point in the future, we may need to change the semantics
34897 // of how we implement retpolines and at that time will likely change the
34898 // name of the called thunk. Essentially, there is no hard guarantee that
34899 // LLVM will generate calls to specific thunks, we merely make a best-effort
34900 // attempt to help out kernels and other systems where duplicating the
34901 // thunks is costly.
34902 switch (Reg) {
34903 case X86::EAX:
34904 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34905 return "__x86_indirect_thunk_eax";
34906 case X86::ECX:
34907 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34908 return "__x86_indirect_thunk_ecx";
34909 case X86::EDX:
34910 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34911 return "__x86_indirect_thunk_edx";
34912 case X86::EDI:
34913 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34914 return "__x86_indirect_thunk_edi";
34915 case X86::R11:
34916 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34917 return "__x86_indirect_thunk_r11";
34918 }
34919 llvm_unreachable("unexpected reg for external indirect thunk");
34920 }
34921
34922 if (Subtarget.useRetpolineIndirectCalls() ||
34923 Subtarget.useRetpolineIndirectBranches()) {
34924 // When targeting an internal COMDAT thunk use an LLVM-specific name.
34925 switch (Reg) {
34926 case X86::EAX:
34927 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34928 return "__llvm_retpoline_eax";
34929 case X86::ECX:
34930 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34931 return "__llvm_retpoline_ecx";
34932 case X86::EDX:
34933 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34934 return "__llvm_retpoline_edx";
34935 case X86::EDI:
34936 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34937 return "__llvm_retpoline_edi";
34938 case X86::R11:
34939 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34940 return "__llvm_retpoline_r11";
34941 }
34942 llvm_unreachable("unexpected reg for retpoline");
34943 }
34944
34945 if (Subtarget.useLVIControlFlowIntegrity()) {
34946 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34947 return "__llvm_lvi_thunk_r11";
34948 }
34949 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
34950}
34951
34952MachineBasicBlock *
34953X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
34954 MachineBasicBlock *BB) const {
34955 // Copy the virtual register into the R11 physical register and
34956 // call the retpoline thunk.
34957 const DebugLoc &DL = MI.getDebugLoc();
34958 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34959 Register CalleeVReg = MI.getOperand(0).getReg();
34960 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
34961
34962 // Find an available scratch register to hold the callee. On 64-bit, we can
34963 // just use R11, but we scan for uses anyway to ensure we don't generate
34964 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
34965 // already a register use operand to the call to hold the callee. If none
34966 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
34967 // register and ESI is the base pointer to realigned stack frames with VLAs.
34968 SmallVector<unsigned, 3> AvailableRegs;
34969 if (Subtarget.is64Bit())
34970 AvailableRegs.push_back(X86::R11);
34971 else
34972 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
34973
34974 // Zero out any registers that are already used.
34975 for (const auto &MO : MI.operands()) {
34976 if (MO.isReg() && MO.isUse())
34977 for (unsigned &Reg : AvailableRegs)
34978 if (Reg == MO.getReg())
34979 Reg = 0;
34980 }
34981
34982 // Choose the first remaining non-zero available register.
34983 unsigned AvailableReg = 0;
34984 for (unsigned MaybeReg : AvailableRegs) {
34985 if (MaybeReg) {
34986 AvailableReg = MaybeReg;
34987 break;
34988 }
34989 }
34990 if (!AvailableReg)
34991 report_fatal_error("calling convention incompatible with retpoline, no "
34992 "available registers");
34993
34994 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
34995
34996 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
34997 .addReg(CalleeVReg);
34998 MI.getOperand(0).ChangeToES(Symbol);
34999 MI.setDesc(TII->get(Opc));
35000 MachineInstrBuilder(*BB->getParent(), &MI)
35001 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35002 return BB;
35003}
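// A minimal sketch, assuming the 64-bit INDIRECT_THUNK_CALL64 case: the
// rewrite above turns the pseudo into
//
//   COPY $r11, %callee_vreg
//   CALL64pcrel32 @__llvm_retpoline_r11, implicit killed $r11
//
// (or the matching __x86_indirect_thunk_* / __llvm_lvi_thunk_r11 symbol,
// depending on which thunk feature is enabled). The 32-bit variants pick a
// free register among EAX/ECX/EDX/EDI instead of R11.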
35004
35005/// SetJmp implies future control flow change upon calling the corresponding
35006/// LongJmp.
35007/// Instead of using the 'return' instruction, the long jump fixes the stack and
35008/// performs an indirect branch. To do so it uses the registers that were stored
35009/// in the jump buffer (when calling SetJmp).
35010/// In case the shadow stack is enabled we need to fix it as well, because some
35011/// return addresses will be skipped.
35012/// The function will save the SSP for future fixing in the function
35013/// emitLongJmpShadowStackFix.
35014/// \sa emitLongJmpShadowStackFix
35015/// \param [in] MI The temporary Machine Instruction for the builtin.
35016/// \param [in] MBB The Machine Basic Block that will be modified.
35017void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35018 MachineBasicBlock *MBB) const {
35019 const DebugLoc &DL = MI.getDebugLoc();
35020 MachineFunction *MF = MBB->getParent();
35021 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35022 MachineRegisterInfo &MRI = MF->getRegInfo();
35023 MachineInstrBuilder MIB;
35024
35025 // Memory Reference.
35026 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35027 MI.memoperands_end());
35028
35029 // Initialize a register with zero.
35030 MVT PVT = getPointerTy(MF->getDataLayout());
35031 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35032 Register ZReg = MRI.createVirtualRegister(PtrRC);
35033 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35034 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
35035 .addDef(ZReg)
35036 .addReg(ZReg, RegState::Undef)
35037 .addReg(ZReg, RegState::Undef);
35038
35039 // Read the current SSP Register value to the zeroed register.
35040 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35041 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35042 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35043
35044 // Write the SSP register value to offset 3 in input memory buffer.
35045 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35046 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
35047 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35048 const unsigned MemOpndSlot = 1;
35049 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35050 if (i == X86::AddrDisp)
35051 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35052 else
35053 MIB.add(MI.getOperand(MemOpndSlot + i));
35054 }
35055 MIB.addReg(SSPCopyReg);
35056 MIB.setMemRefs(MMOs);
35057}
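// A minimal sketch, assuming a 64-bit target: the code above stores the
// current shadow-stack pointer into the fourth pointer slot of the setjmp
// buffer (SSPOffset = 3 * 8 = 24):
//
//   xorq   %r, %r        ; zeroed input for RDSSP
//   rdsspq %r            ; stays zero if CET is not active
//   movq   %r, 24(buf)   ; consumed later by emitLongJmpShadowStackFix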
35058
35059MachineBasicBlock *
35060X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35061 MachineBasicBlock *MBB) const {
35062 const DebugLoc &DL = MI.getDebugLoc();
35063 MachineFunction *MF = MBB->getParent();
35064 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35065 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35066 MachineRegisterInfo &MRI = MF->getRegInfo();
35067
35068 const BasicBlock *BB = MBB->getBasicBlock();
35069 MachineFunction::iterator I = ++MBB->getIterator();
35070
35071 // Memory Reference
35072 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35073 MI.memoperands_end());
35074
35075 unsigned DstReg;
35076 unsigned MemOpndSlot = 0;
35077
35078 unsigned CurOp = 0;
35079
35080 DstReg = MI.getOperand(CurOp++).getReg();
35081 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35082 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35083 (void)TRI;
35084 Register mainDstReg = MRI.createVirtualRegister(RC);
35085 Register restoreDstReg = MRI.createVirtualRegister(RC);
35086
35087 MemOpndSlot = CurOp;
35088
35089 MVT PVT = getPointerTy(MF->getDataLayout());
35090 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35091 "Invalid Pointer Size!");
35092
35093 // For v = setjmp(buf), we generate
35094 //
35095 // thisMBB:
35096 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35097 // SjLjSetup restoreMBB
35098 //
35099 // mainMBB:
35100 // v_main = 0
35101 //
35102 // sinkMBB:
35103 // v = phi(main, restore)
35104 //
35105 // restoreMBB:
35106 // if base pointer being used, load it from frame
35107 // v_restore = 1
35108
35109 MachineBasicBlock *thisMBB = MBB;
35110 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35111 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35112 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35113 MF->insert(I, mainMBB);
35114 MF->insert(I, sinkMBB);
35115 MF->push_back(restoreMBB);
35116 restoreMBB->setHasAddressTaken();
35117
35118 MachineInstrBuilder MIB;
35119
35120 // Transfer the remainder of BB and its successor edges to sinkMBB.
35121 sinkMBB->splice(sinkMBB->begin(), MBB,
35122 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35123 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35124
35125 // thisMBB:
35126 unsigned PtrStoreOpc = 0;
35127 unsigned LabelReg = 0;
35128 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35129 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35130 !isPositionIndependent();
35131
35132 // Prepare IP either in reg or imm.
35133 if (!UseImmLabel) {
35134 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35135 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35136 LabelReg = MRI.createVirtualRegister(PtrRC);
35137 if (Subtarget.is64Bit()) {
35138 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
35139 .addReg(X86::RIP)
35140 .addImm(0)
35141 .addReg(0)
35142 .addMBB(restoreMBB)
35143 .addReg(0);
35144 } else {
35145 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35146 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
35147 .addReg(XII->getGlobalBaseReg(MF))
35148 .addImm(0)
35149 .addReg(0)
35150 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35151 .addReg(0);
35152 }
35153 } else
35154 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35155 // Store IP
35156 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
35157 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35158 if (i == X86::AddrDisp)
35159 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35160 else
35161 MIB.add(MI.getOperand(MemOpndSlot + i));
35162 }
35163 if (!UseImmLabel)
35164 MIB.addReg(LabelReg);
35165 else
35166 MIB.addMBB(restoreMBB);
35167 MIB.setMemRefs(MMOs);
35168
35169 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35170 emitSetJmpShadowStackFix(MI, thisMBB);
35171 }
35172
35173 // Setup
35174 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
35175 .addMBB(restoreMBB);
35176
35177 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35178 MIB.addRegMask(RegInfo->getNoPreservedMask());
35179 thisMBB->addSuccessor(mainMBB);
35180 thisMBB->addSuccessor(restoreMBB);
35181
35182 // mainMBB:
35183 // EAX = 0
35184 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
35185 mainMBB->addSuccessor(sinkMBB);
35186
35187 // sinkMBB:
35188 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
35189 TII->get(X86::PHI), DstReg)
35190 .addReg(mainDstReg).addMBB(mainMBB)
35191 .addReg(restoreDstReg).addMBB(restoreMBB);
35192
35193 // restoreMBB:
35194 if (RegInfo->hasBasePointer(*MF)) {
35195 const bool Uses64BitFramePtr =
35196 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35197 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35198 X86FI->setRestoreBasePointer(MF);
35199 Register FramePtr = RegInfo->getFrameRegister(*MF);
35200 Register BasePtr = RegInfo->getBaseRegister();
35201 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35202 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
35203 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35204 .setMIFlag(MachineInstr::FrameSetup);
35205 }
35206 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35207 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35208 restoreMBB->addSuccessor(sinkMBB);
35209
35210 MI.eraseFromParent();
35211 return sinkMBB;
35212}
35213
35214/// Fix the shadow stack using the previously saved SSP pointer.
35215/// \sa emitSetJmpShadowStackFix
35216/// \param [in] MI The temporary Machine Instruction for the builtin.
35217/// \param [in] MBB The Machine Basic Block that will be modified.
35218/// \return The sink MBB that will perform the future indirect branch.
35219MachineBasicBlock *
35220X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35221 MachineBasicBlock *MBB) const {
35222 const DebugLoc &DL = MI.getDebugLoc();
35223 MachineFunction *MF = MBB->getParent();
35224 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35225 MachineRegisterInfo &MRI = MF->getRegInfo();
35226
35227 // Memory Reference
35228 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35229 MI.memoperands_end());
35230
35231 MVT PVT = getPointerTy(MF->getDataLayout());
35232 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35233
35234 // checkSspMBB:
35235 // xor vreg1, vreg1
35236 // rdssp vreg1
35237 // test vreg1, vreg1
35238 // je sinkMBB # Jump if Shadow Stack is not supported
35239 // fallMBB:
35240 // mov buf+24/12(%rip), vreg2
35241 // sub vreg1, vreg2
35242 // jbe sinkMBB # No need to fix the Shadow Stack
35243 // fixShadowMBB:
35244 // shr 3/2, vreg2
35245 // incssp vreg2 # fix the SSP according to the lower 8 bits
35246 // shr 8, vreg2
35247 // je sinkMBB
35248 // fixShadowLoopPrepareMBB:
35249 // shl vreg2
35250 // mov 128, vreg3
35251 // fixShadowLoopMBB:
35252 // incssp vreg3
35253 // dec vreg2
35254 // jne fixShadowLoopMBB # Iterate until you finish fixing
35255 // # the Shadow Stack
35256 // sinkMBB:
35257
35258 MachineFunction::iterator I = ++MBB->getIterator();
35259 const BasicBlock *BB = MBB->getBasicBlock();
35260
35261 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35262 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35263 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35264 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35265 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35266 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35267 MF->insert(I, checkSspMBB);
35268 MF->insert(I, fallMBB);
35269 MF->insert(I, fixShadowMBB);
35270 MF->insert(I, fixShadowLoopPrepareMBB);
35271 MF->insert(I, fixShadowLoopMBB);
35272 MF->insert(I, sinkMBB);
35273
35274 // Transfer the remainder of BB and its successor edges to sinkMBB.
35275 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35276 MBB->end());
35277 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35278
35279 MBB->addSuccessor(checkSspMBB);
35280
35281 // Initialize a register with zero.
35282 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35283 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
35284
35285 if (PVT == MVT::i64) {
35286 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35287 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35288 .addImm(0)
35289 .addReg(ZReg)
35290 .addImm(X86::sub_32bit);
35291 ZReg = TmpZReg;
35292 }
35293
35294 // Read the current SSP Register value to the zeroed register.
35295 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35296 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35297 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35298
35299 // Check whether the value read from the SSP register is zero and, if so,
35300 // jump directly to the sink.
35301 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35302 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
35303 .addReg(SSPCopyReg)
35304 .addReg(SSPCopyReg);
35305 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35306 checkSspMBB->addSuccessor(sinkMBB);
35307 checkSspMBB->addSuccessor(fallMBB);
35308
35309 // Reload the previously saved SSP register value.
35310 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35311 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35312 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35313 MachineInstrBuilder MIB =
35314 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
35315 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35316 const MachineOperand &MO = MI.getOperand(i);
35317 if (i == X86::AddrDisp)
35318 MIB.addDisp(MO, SPPOffset);
35319 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35320 // preserve kill flags.
35321 MIB.addReg(MO.getReg());
35322 else
35323 MIB.add(MO);
35324 }
35325 MIB.setMemRefs(MMOs);
35326
35327 // Subtract the current SSP from the previous SSP.
35328 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35329 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35330 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
35331 .addReg(PrevSSPReg)
35332 .addReg(SSPCopyReg);
35333
35334 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35335 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
35336 fallMBB->addSuccessor(sinkMBB);
35337 fallMBB->addSuccessor(fixShadowMBB);
35338
35339 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35340 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35341 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35342 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35343 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
35344 .addReg(SspSubReg)
35345 .addImm(Offset);
35346
35347 // Increase the SSP using only the lower 8 bits of the delta.
35348 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35349 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35350
35351 // Reset the lower 8 bits.
35352 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35353 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
35354 .addReg(SspFirstShrReg)
35355 .addImm(8);
35356
35357 // Jump if the result of the shift is zero.
35358 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35359 fixShadowMBB->addSuccessor(sinkMBB);
35360 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35361
35362 // Do a single shift left.
35363 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
35364 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35365 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
35366 .addReg(SspSecondShrReg);
35367
35368 // Save the value 128 to a register (will be used next with incssp).
35369 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35370 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35371 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
35372 .addImm(128);
35373 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35374
35375 // Since incssp only looks at the lower 8 bits, we might need to do several
35376 // iterations of incssp until we finish fixing the shadow stack.
35377 Register DecReg = MRI.createVirtualRegister(PtrRC);
35378 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35379 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
35380 .addReg(SspAfterShlReg)
35381 .addMBB(fixShadowLoopPrepareMBB)
35382 .addReg(DecReg)
35383 .addMBB(fixShadowLoopMBB);
35384
35385 // Every iteration we increase the SSP by 128.
35386 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
35387
35388 // Every iteration we decrement the counter by 1.
35389 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35390 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
35391
35392 // Jump if the counter is not zero yet.
35393 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
35394 fixShadowLoopMBB->addSuccessor(sinkMBB);
35395 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35396
35397 return sinkMBB;
35398}
35399
35400MachineBasicBlock *
35401X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35402 MachineBasicBlock *MBB) const {
35403 const DebugLoc &DL = MI.getDebugLoc();
35404 MachineFunction *MF = MBB->getParent();
35405 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35406 MachineRegisterInfo &MRI = MF->getRegInfo();
35407
35408 // Memory Reference
35409 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35410 MI.memoperands_end());
35411
35412 MVT PVT = getPointerTy(MF->getDataLayout());
35413 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35414 "Invalid Pointer Size!");
35415
35416 const TargetRegisterClass *RC =
35417 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35418 Register Tmp = MRI.createVirtualRegister(RC);
35419 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35420 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35421 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35422 Register SP = RegInfo->getStackRegister();
35423
35424 MachineInstrBuilder MIB;
35425
35426 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35427 const int64_t SPOffset = 2 * PVT.getStoreSize();
35428
35429 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35430 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
35431
35432 MachineBasicBlock *thisMBB = MBB;
35433
35434 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
35435 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35436 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35437 }
35438
35439 // Reload FP
35440 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
35441 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35442 const MachineOperand &MO = MI.getOperand(i);
35443 if (MO.isReg()) // Don't add the whole operand, we don't want to
35444 // preserve kill flags.
35445 MIB.addReg(MO.getReg());
35446 else
35447 MIB.add(MO);
35448 }
35449 MIB.setMemRefs(MMOs);
35450
35451 // Reload IP
35452 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
35453 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35454 const MachineOperand &MO = MI.getOperand(i);
35455 if (i == X86::AddrDisp)
35456 MIB.addDisp(MO, LabelOffset);
35457 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35458 // preserve kill flags.
35459 MIB.addReg(MO.getReg());
35460 else
35461 MIB.add(MO);
35462 }
35463 MIB.setMemRefs(MMOs);
35464
35465 // Reload SP
35466 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
35467 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35468 if (i == X86::AddrDisp)
35469 MIB.addDisp(MI.getOperand(i), SPOffset);
35470 else
35471 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
35472 // the last instruction of the expansion.
35473 }
35474 MIB.setMemRefs(MMOs);
35475
35476 // Jump
35477 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
35478
35479 MI.eraseFromParent();
35480 return thisMBB;
35481}
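// A minimal sketch, assuming a 64-bit target: the expansion above reads the
// buffer laid out by setjmp and branches through it:
//
//   movq  0(buf),  %rbp   ; reload FP
//   movq  8(buf),  %tmp   ; reload IP   (LabelOffset = 1 * 8)
//   movq  16(buf), %rsp   ; reload SP   (SPOffset   = 2 * 8)
//   jmpq  *%tmp
//
// with the shadow-stack fix-up inserted first when the module carries the
// "cf-protection-return" flag.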
35482
35483void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
35484 MachineBasicBlock *MBB,
35485 MachineBasicBlock *DispatchBB,
35486 int FI) const {
35487 const DebugLoc &DL = MI.getDebugLoc();
35488 MachineFunction *MF = MBB->getParent();
35489 MachineRegisterInfo *MRI = &MF->getRegInfo();
35490 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35491
35492 MVT PVT = getPointerTy(MF->getDataLayout());
35493 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
35494
35495 unsigned Op = 0;
35496 unsigned VR = 0;
35497
35498 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35499 !isPositionIndependent();
35500
35501 if (UseImmLabel) {
35502 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35503 } else {
35504 const TargetRegisterClass *TRC =
35505 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35506 VR = MRI->createVirtualRegister(TRC);
35507 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35508
35509 if (Subtarget.is64Bit())
35510 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
35511 .addReg(X86::RIP)
35512 .addImm(1)
35513 .addReg(0)
35514 .addMBB(DispatchBB)
35515 .addReg(0);
35516 else
35517 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
35518 .addReg(0) /* TII->getGlobalBaseReg(MF) */
35519 .addImm(1)
35520 .addReg(0)
35521 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
35522 .addReg(0);
35523 }
35524
35525 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
35526 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
35527 if (UseImmLabel)
35528 MIB.addMBB(DispatchBB);
35529 else
35530 MIB.addReg(VR);
35531}
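// A minimal sketch of what this helper plants in the SjLj function context,
// assuming a 64-bit target that cannot use the immediate-label shortcut:
//
//   leaq  DispatchBB(%rip), %vr
//   movq  %vr, 56(FI)            ; 36(FI) on 32-bit targets
//
// With the small, non-PIC code model the address is stored directly as an
// immediate (MOV64mi32 / MOV32mi) instead of going through a vreg.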
35532
35533MachineBasicBlock *
35534X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
35535 MachineBasicBlock *BB) const {
35536 const DebugLoc &DL = MI.getDebugLoc();
35537 MachineFunction *MF = BB->getParent();
35538 MachineRegisterInfo *MRI = &MF->getRegInfo();
35539 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35540 int FI = MF->getFrameInfo().getFunctionContextIndex();
35541
35542 // Get a mapping of the call site numbers to all of the landing pads they're
35543 // associated with.
35544 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
35545 unsigned MaxCSNum = 0;
35546 for (auto &MBB : *MF) {
35547 if (!MBB.isEHPad())
35548 continue;
35549
35550 MCSymbol *Sym = nullptr;
35551 for (const auto &MI : MBB) {
35552 if (MI.isDebugInstr())
35553 continue;
35554
35555 assert(MI.isEHLabel() && "expected EH_LABEL");
35556 Sym = MI.getOperand(0).getMCSymbol();
35557 break;
35558 }
35559
35560 if (!MF->hasCallSiteLandingPad(Sym))
35561 continue;
35562
35563 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
35564 CallSiteNumToLPad[CSI].push_back(&MBB);
35565 MaxCSNum = std::max(MaxCSNum, CSI);
35566 }
35567 }
35568
35569 // Get an ordered list of the machine basic blocks for the jump table.
35570 std::vector<MachineBasicBlock *> LPadList;
35571 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
35572 LPadList.reserve(CallSiteNumToLPad.size());
35573
35574 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
35575 for (auto &LP : CallSiteNumToLPad[CSI]) {
35576 LPadList.push_back(LP);
35577 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
35578 }
35579 }
35580
35581 assert(!LPadList.empty() &&
35582 "No landing pad destinations for the dispatch jump table!");
35583
35584 // Create the MBBs for the dispatch code.
35585
35586 // Shove the dispatch's address into the return slot in the function context.
35587 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
35588 DispatchBB->setIsEHPad(true);
35589
35590 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
35591 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
35592 DispatchBB->addSuccessor(TrapBB);
35593
35594 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
35595 DispatchBB->addSuccessor(DispContBB);
35596
35597 // Insert MBBs.
35598 MF->push_back(DispatchBB);
35599 MF->push_back(DispContBB);
35600 MF->push_back(TrapBB);
35601
35602 // Insert code into the entry block that creates and registers the function
35603 // context.
35604 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
35605
35606 // Create the jump table and associated information
35607 unsigned JTE = getJumpTableEncoding();
35608 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
35609 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
35610
35611 const X86RegisterInfo &RI = TII->getRegisterInfo();
35612 // Add a register mask with no preserved registers. This results in all
35613 // registers being marked as clobbered.
35614 if (RI.hasBasePointer(*MF)) {
35615 const bool FPIs64Bit =
35616 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35617 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
35618 MFI->setRestoreBasePointer(MF);
35619
35620 Register FP = RI.getFrameRegister(*MF);
35621 Register BP = RI.getBaseRegister();
35622 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
35623 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
35624 MFI->getRestoreBasePointerOffset())
35625 .addRegMask(RI.getNoPreservedMask());
35626 } else {
35627 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
35628 .addRegMask(RI.getNoPreservedMask());
35629 }
35630
35631 // IReg is used as an index in a memory operand and therefore can't be SP
35632 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
35633 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
35634 Subtarget.is64Bit() ? 8 : 4);
35635 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
35636 .addReg(IReg)
35637 .addImm(LPadList.size());
35638 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
35639
35640 if (Subtarget.is64Bit()) {
35641 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35642 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
35643
35644 // leaq .LJTI0_0(%rip), BReg
35645 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
35646 .addReg(X86::RIP)
35647 .addImm(1)
35648 .addReg(0)
35649 .addJumpTableIndex(MJTI)
35650 .addReg(0);
35651 // movzx IReg64, IReg
35652 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
35653 .addImm(0)
35654 .addReg(IReg)
35655 .addImm(X86::sub_32bit);
35656
35657 switch (JTE) {
35658 case MachineJumpTableInfo::EK_BlockAddress:
35659 // jmpq *(BReg,IReg64,8)
35660 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
35661 .addReg(BReg)
35662 .addImm(8)
35663 .addReg(IReg64)
35664 .addImm(0)
35665 .addReg(0);
35666 break;
35667 case MachineJumpTableInfo::EK_LabelDifference32: {
35668 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
35669 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
35670 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35671
35672 // movl (BReg,IReg64,4), OReg
35673 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
35674 .addReg(BReg)
35675 .addImm(4)
35676 .addReg(IReg64)
35677 .addImm(0)
35678 .addReg(0);
35679 // movsx OReg64, OReg
35680 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
35681 // addq BReg, OReg64, TReg
35682 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
35683 .addReg(OReg64)
35684 .addReg(BReg);
35685 // jmpq *TReg
35686 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
35687 break;
35688 }
35689 default:
35690 llvm_unreachable("Unexpected jump table encoding");
35691 }
35692 } else {
35693 // jmpl *.LJTI0_0(,IReg,4)
35694 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
35695 .addReg(0)
35696 .addImm(4)
35697 .addReg(IReg)
35698 .addJumpTableIndex(MJTI)
35699 .addReg(0);
35700 }
35701
35702 // Add the jump table entries as successors to the MBB.
35703 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
35704 for (auto &LP : LPadList)
35705 if (SeenMBBs.insert(LP).second)
35706 DispContBB->addSuccessor(LP);
35707
35708 // N.B. the order the invoke BBs are processed in doesn't matter here.
35709 SmallVector<MachineBasicBlock *, 64> MBBLPads;
35710 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
35711 for (MachineBasicBlock *MBB : InvokeBBs) {
35712 // Remove the landing pad successor from the invoke block and replace it
35713 // with the new dispatch block.
35714 // Keep a copy of Successors since it's modified inside the loop.
35715 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
35716 MBB->succ_rend());
35717 // FIXME: Avoid quadratic complexity.
35718 for (auto MBBS : Successors) {
35719 if (MBBS->isEHPad()) {
35720 MBB->removeSuccessor(MBBS);
35721 MBBLPads.push_back(MBBS);
35722 }
35723 }
35724
35725 MBB->addSuccessor(DispatchBB);
35726
35727 // Find the invoke call and mark all of the callee-saved registers as
35728 // 'implicit defined' so that they're spilled. This prevents later passes
35729 // from moving instructions to before the EH block, where they would never
35730 // be executed.
35731 for (auto &II : reverse(*MBB)) {
35732 if (!II.isCall())
35733 continue;
35734
35735 DenseMap<unsigned, bool> DefRegs;
35736 for (auto &MOp : II.operands())
35737 if (MOp.isReg())
35738 DefRegs[MOp.getReg()] = true;
35739
35740 MachineInstrBuilder MIB(*MF, &II);
35741 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
35742 unsigned Reg = SavedRegs[RegIdx];
35743 if (!DefRegs[Reg])
35744 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
35745 }
35746
35747 break;
35748 }
35749 }
35750
35751 // Mark all former landing pads as non-landing pads. The dispatch is the only
35752 // landing pad now.
35753 for (auto &LP : MBBLPads)
35754 LP->setIsEHPad(false);
35755
35756 // The instruction is gone now.
35757 MI.eraseFromParent();
35758 return BB;
35759}
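// A minimal sketch of the dispatch built above, assuming 64-bit and the
// EK_BlockAddress jump-table encoding:
//
//   movl  8(FI), %idx          ; call-site index (4(FI) on 32-bit)
//   cmpl  $NumLPads, %idx
//   jae   TrapBB               ; out of range -> trap
//   leaq  .LJTI(%rip), %base
//   jmpq  *(%base,%idx,8)      ; indexed jump to the selected landing pad
//
// Every former invoke block is rewired to branch here instead of to its
// original landing pads, which are no longer EH pads themselves.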
35760
35761MachineBasicBlock *
35762X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
35763 MachineBasicBlock *BB) const {
35764 MachineFunction *MF = BB->getParent();
35765 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35766 const DebugLoc &DL = MI.getDebugLoc();
35767
35768 auto TMMImmToTMMReg = [](unsigned Imm) {
35769 assert(Imm < 8 && "Illegal tmm index");
35770 return X86::TMM0 + Imm;
35771 };
35772 switch (MI.getOpcode()) {
35773 default: llvm_unreachable("Unexpected instr type to insert");
35774 case X86::TLS_addr32:
35775 case X86::TLS_addr64:
35776 case X86::TLS_addrX32:
35777 case X86::TLS_base_addr32:
35778 case X86::TLS_base_addr64:
35779 case X86::TLS_base_addrX32:
35780 return EmitLoweredTLSAddr(MI, BB);
35781 case X86::INDIRECT_THUNK_CALL32:
35782 case X86::INDIRECT_THUNK_CALL64:
35783 case X86::INDIRECT_THUNK_TCRETURN32:
35784 case X86::INDIRECT_THUNK_TCRETURN64:
35785 return EmitLoweredIndirectThunk(MI, BB);
35786 case X86::CATCHRET:
35787 return EmitLoweredCatchRet(MI, BB);
35788 case X86::SEG_ALLOCA_32:
35789 case X86::SEG_ALLOCA_64:
35790 return EmitLoweredSegAlloca(MI, BB);
35791 case X86::PROBED_ALLOCA_32:
35792 case X86::PROBED_ALLOCA_64:
35793 return EmitLoweredProbedAlloca(MI, BB);
35794 case X86::TLSCall_32:
35795 case X86::TLSCall_64:
35796 return EmitLoweredTLSCall(MI, BB);
35797 case X86::CMOV_FR32:
35798 case X86::CMOV_FR32X:
35799 case X86::CMOV_FR64:
35800 case X86::CMOV_FR64X:
35801 case X86::CMOV_GR8:
35802 case X86::CMOV_GR16:
35803 case X86::CMOV_GR32:
35804 case X86::CMOV_RFP32:
35805 case X86::CMOV_RFP64:
35806 case X86::CMOV_RFP80:
35807 case X86::CMOV_VR64:
35808 case X86::CMOV_VR128:
35809 case X86::CMOV_VR128X:
35810 case X86::CMOV_VR256:
35811 case X86::CMOV_VR256X:
35812 case X86::CMOV_VR512:
35813 case X86::CMOV_VK1:
35814 case X86::CMOV_VK2:
35815 case X86::CMOV_VK4:
35816 case X86::CMOV_VK8:
35817 case X86::CMOV_VK16:
35818 case X86::CMOV_VK32:
35819 case X86::CMOV_VK64:
35820 return EmitLoweredSelect(MI, BB);
35821
35822 case X86::RDFLAGS32:
35823 case X86::RDFLAGS64: {
35824 unsigned PushF =
35825 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
35826 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
35827 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
35828 // Permit reads of the EFLAGS and DF registers without them being defined.
35829 // This intrinsic exists to read external processor state in flags, such as
35830 // the trap flag, interrupt flag, and direction flag, none of which are
35831 // modeled by the backend.
35832 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
35833 "Unexpected register in operand!");
35834 Push->getOperand(2).setIsUndef();
35835 assert(Push->getOperand(3).getReg() == X86::DF &&
35836 "Unexpected register in operand!");
35837 Push->getOperand(3).setIsUndef();
35838 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
35839
35840 MI.eraseFromParent(); // The pseudo is gone now.
35841 return BB;
35842 }
35843
35844 case X86::WRFLAGS32:
35845 case X86::WRFLAGS64: {
35846 unsigned Push =
35847 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
35848 unsigned PopF =
35849 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
35850 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
35851 BuildMI(*BB, MI, DL, TII->get(PopF));
35852
35853 MI.eraseFromParent(); // The pseudo is gone now.
35854 return BB;
35855 }
35856
35857 case X86::FP32_TO_INT16_IN_MEM:
35858 case X86::FP32_TO_INT32_IN_MEM:
35859 case X86::FP32_TO_INT64_IN_MEM:
35860 case X86::FP64_TO_INT16_IN_MEM:
35861 case X86::FP64_TO_INT32_IN_MEM:
35862 case X86::FP64_TO_INT64_IN_MEM:
35863 case X86::FP80_TO_INT16_IN_MEM:
35864 case X86::FP80_TO_INT32_IN_MEM:
35865 case X86::FP80_TO_INT64_IN_MEM: {
35866 // Change the floating point control register to use "round towards zero"
35867 // mode when truncating to an integer value.
35868 int OrigCWFrameIdx =
35869 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35870 addFrameReference(BuildMI(*BB, MI, DL,
35871 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
35872
35873 // Load the old value of the control word...
35874 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35875 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
35876 OrigCWFrameIdx);
35877
35878 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
35879 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35880 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
35881 .addReg(OldCW, RegState::Kill).addImm(0xC00);
35882
35883 // Extract to 16 bits.
35884 Register NewCW16 =
35885 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35886 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
35887 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35888
35889 // Prepare memory for FLDCW.
35890 int NewCWFrameIdx =
35891 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35892 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
35893 NewCWFrameIdx)
35894 .addReg(NewCW16, RegState::Kill);
35895
35896 // Reload the modified control word now...
35897 addFrameReference(BuildMI(*BB, MI, DL,
35898 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
35899
35900 // Get the X86 opcode to use.
35901 unsigned Opc;
35902 switch (MI.getOpcode()) {
35903 default: llvm_unreachable("illegal opcode!");
35904 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
35905 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
35906 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
35907 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
35908 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
35909 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
35910 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
35911 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
35912 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
35913 }
35914
35915 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35916 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
35917 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
35918
35919 // Reload the original control word now.
35920 addFrameReference(BuildMI(*BB, MI, DL,
35921 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
35922
35923 MI.eraseFromParent(); // The pseudo instruction is gone now.
35924 return BB;
35925 }
35926
35927 // xbegin
35928 case X86::XBEGIN:
35929 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
35930
35931 case X86::VAARG_64:
35932 case X86::VAARG_X32:
35933 return EmitVAARGWithCustomInserter(MI, BB);
35934
35935 case X86::EH_SjLj_SetJmp32:
35936 case X86::EH_SjLj_SetJmp64:
35937 return emitEHSjLjSetJmp(MI, BB);
35938
35939 case X86::EH_SjLj_LongJmp32:
35940 case X86::EH_SjLj_LongJmp64:
35941 return emitEHSjLjLongJmp(MI, BB);
35942
35943 case X86::Int_eh_sjlj_setup_dispatch:
35944 return EmitSjLjDispatchBlock(MI, BB);
35945
35946 case TargetOpcode::STATEPOINT:
35947 // As an implementation detail, STATEPOINT shares the STACKMAP format at
35948 // this point in the process. We diverge later.
35949 return emitPatchPoint(MI, BB);
35950
35951 case TargetOpcode::STACKMAP:
35952 case TargetOpcode::PATCHPOINT:
35953 return emitPatchPoint(MI, BB);
35954
35955 case TargetOpcode::PATCHABLE_EVENT_CALL:
35956 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
35957 return BB;
35958
35959 case X86::LCMPXCHG8B: {
35960 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35961 // In addition to the four E[ABCD] registers implied by its encoding,
35962 // CMPXCHG8B requires a memory operand. If the current target is i686 and
35963 // the current function needs a base pointer - which is ESI on i686 - the
35964 // register allocator would not be able to allocate registers for an
35965 // address of the form X(%reg, %reg, Y): there would never be enough
35966 // unreserved registers during regalloc (without the base pointer the only
35967 // option would be X(%edi, %esi, Y)).
35968 // We give the register allocator a hand by precomputing the address in
35969 // a new vreg using LEA.
35970
35971 // If it is not i686 or there is no base pointer - nothing to do here.
35972 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
35973 return BB;
35974
35975 // Even though this code does not necessarily need the base pointer to
35976 // be ESI, we check for that. The reason: if this assert fails, something
35977 // has changed in the compiler's base pointer handling, which most
35978 // probably has to be addressed somehow here.
35979 assert(TRI->getBaseRegister() == X86::ESI &&
35980 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
35981 "base pointer in mind");
35982
35983 MachineRegisterInfo &MRI = MF->getRegInfo();
35984 MVT SPTy = getPointerTy(MF->getDataLayout());
35985 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
35986 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
35987
35988 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35989 // Regalloc does not need any help when the memory operand of CMPXCHG8B
35990 // does not use an index register.
35991 if (AM.IndexReg == X86::NoRegister)
35992 return BB;
35993
35994 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
35995 // four operand definitions that are E[ABCD] registers. We skip them and
35996 // then insert the LEA.
35997 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
35998 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
35999 RMBBI->definesRegister(X86::EBX) ||
36000 RMBBI->definesRegister(X86::ECX) ||
36001 RMBBI->definesRegister(X86::EDX))) {
36002 ++RMBBI;
36003 }
36004 MachineBasicBlock::iterator MBBI(RMBBI);
36005 addFullAddress(
36006 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
36007
36008 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36009
36010 return BB;
36011 }
36012 case X86::LCMPXCHG16B_NO_RBX: {
36013 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36014 Register BasePtr = TRI->getBaseRegister();
36015 if (TRI->hasBasePointer(*MF) &&
36016 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
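// CMPXCHG16B implicitly uses RBX for the low half of the replacement value,
// so when RBX doubles as the base pointer it has to be preserved around the
// operation via the LCMPXCHG16B_SAVE_RBX pseudo below.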
36017 if (!BB->isLiveIn(BasePtr))
36018 BB->addLiveIn(BasePtr);
36019 // Save RBX into a virtual register.
36020 Register SaveRBX =
36021 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36022 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
36023 .addReg(X86::RBX);
36024 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36025 MachineInstrBuilder MIB =
36026 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36027 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36028 MIB.add(MI.getOperand(Idx));
36029 MIB.add(MI.getOperand(X86::AddrNumOperands));
36030 MIB.addReg(SaveRBX);
36031 } else {
36032 // Simple case, just copy the virtual register to RBX.
36033 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
36034 .add(MI.getOperand(X86::AddrNumOperands));
36035 MachineInstrBuilder MIB =
36036 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
36037 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36038 MIB.add(MI.getOperand(Idx));
36039 }
36040 MI.eraseFromParent();
36041 return BB;
36042 }
36043 case X86::MWAITX: {
36044 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36045 Register BasePtr = TRI->getBaseRegister();
36046 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36047 // If there is no need to save the base pointer, we generate MWAITXrrr;
36048 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
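// (MWAITX takes its timer value in EBX, which conflicts with RBX/EBX being
// reserved as the base pointer, hence the save/restore pseudo.)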
36049 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36050 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
36051 .addReg(MI.getOperand(0).getReg());
36052 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
36053 .addReg(MI.getOperand(1).getReg());
36054 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
36055 .addReg(MI.getOperand(2).getReg());
36056 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
36057 MI.eraseFromParent();
36058 } else {
36059 if (!BB->isLiveIn(BasePtr)) {
36060 BB->addLiveIn(BasePtr);
36061 }
36062 // Parameters can be copied into ECX and EAX but not EBX yet.
36063 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
36064 .addReg(MI.getOperand(0).getReg());
36065 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
36066 .addReg(MI.getOperand(1).getReg());
36067 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36068 // Save RBX into a virtual register.
36069 Register SaveRBX =
36070 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36071 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
36072 .addReg(X86::RBX);
36073 // Generate mwaitx pseudo.
36074 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36075 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
36076 .addDef(Dst) // Destination tied in with SaveRBX.
36077 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36078 .addUse(SaveRBX); // Save of base pointer.
36079 MI.eraseFromParent();
36080 }
36081 return BB;
36082 }
36083 case TargetOpcode::PREALLOCATED_SETUP: {
36084 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36085 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36086 MFI->setHasPreallocatedCall(true);
36087 int64_t PreallocatedId = MI.getOperand(0).getImm();
36088 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36089 assert(StackAdjustment != 0 && "0 stack adjustment");
36090 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36091 << StackAdjustment << "\n");
36092 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
36093 .addReg(X86::ESP)
36094 .addImm(StackAdjustment);
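// i.e. emit "subl $StackAdjustment, %esp" to reserve the preallocated
// argument area.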
36095 MI.eraseFromParent();
36096 return BB;
36097 }
36098 case TargetOpcode::PREALLOCATED_ARG: {
36099 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36100 int64_t PreallocatedId = MI.getOperand(1).getImm();
36101 int64_t ArgIdx = MI.getOperand(2).getImm();
36102 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36103 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36104 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36105 << ", arg offset " << ArgOffset << "\n");
36106 // stack pointer + offset
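// i.e. emit "leal ArgOffset(%esp), %dst" to materialize the argument address.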
36107 addRegOffset(
36108 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
36109 X86::ESP, false, ArgOffset);
36110 MI.eraseFromParent();
36111 return BB;
36112 }
36113 case X86::PTDPBSSD:
36114 case X86::PTDPBSUD:
36115 case X86::PTDPBUSD:
36116 case X86::PTDPBUUD:
36117 case X86::PTDPBF16PS: {
36118 unsigned Opc;
36119 switch (MI.getOpcode()) {
36120 default: llvm_unreachable("illegal opcode!");
36121 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36122 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36123 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36124 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36125 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36126 }
36127
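// The PTDP* pseudos carry AMX tile numbers as immediates; TMMImmToTMMReg maps
// them to the physical TMM registers. The destination tile is added both as a
// def and as an input since the real TDP* instructions tie the destination to
// a source operand.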
36128 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
36129 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36130 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36131 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36132 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36133
36134 MI.eraseFromParent(); // The pseudo is gone now.
36135 return BB;
36136 }
36137 case X86::PTILEZERO: {
36138 unsigned Imm = MI.getOperand(0).getImm();
36139 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36140 MI.eraseFromParent(); // The pseudo is gone now.
36141 return BB;
36142 }
36143 case X86::PTILELOADD:
36144 case X86::PTILELOADDT1:
36145 case X86::PTILESTORED: {
36146 unsigned Opc;
36147 switch (MI.getOpcode()) {
36148 default: llvm_unreachable("illegal opcode!");
36149 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
36150 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
36151 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
36152 }
36153
36154 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
36155 unsigned CurOp = 0;
36156 if (Opc != X86::TILESTORED)
36157 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36158 RegState::Define);
36159
36160 MIB.add(MI.getOperand(CurOp++)); // base
36161 MIB.add(MI.getOperand(CurOp++)); // scale
36162 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36163 MIB.add(MI.getOperand(CurOp++)); // displacement
36164 MIB.add(MI.getOperand(CurOp++)); // segment
36165
36166 if (Opc == X86::TILESTORED)
36167 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36168 RegState::Undef);
36169
36170 MI.eraseFromParent(); // The pseudo is gone now.
36171 return BB;
36172 }
36173 }
36174}
36175
36176//===----------------------------------------------------------------------===//
36177// X86 Optimization Hooks
36178//===----------------------------------------------------------------------===//
36179
36180bool
36181X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36182 const APInt &DemandedBits,
36183 const APInt &DemandedElts,
36184 TargetLoweringOpt &TLO) const {
36185 EVT VT = Op.getValueType();
36186 unsigned Opcode = Op.getOpcode();
36187 unsigned EltSize = VT.getScalarSizeInBits();
36188
36189 if (VT.isVector()) {
36190 // If the constant is all sign bits within the active bits, then we should
36191 // sign-extend it to the entire constant to allow it to act as a boolean
36192 // constant vector.
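// For example, with only bit 0 of each element demanded, a v4i32 constant of
// <1,1,1,1> can be sign-extended from i1 to <-1,-1,-1,-1>, the canonical
// all-ones boolean vector.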
36193 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36194 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36195 return false;
36196 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36197 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36198 continue;
36199 const APInt &Val = V.getConstantOperandAPInt(i);
36200 if (Val.getBitWidth() > Val.getNumSignBits() &&
36201 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36202 return true;
36203 }
36204 return false;
36205 };
36206 // For vectors - if we have a constant, then try to sign extend.
36207 // TODO: Handle AND/ANDN cases.
36208 unsigned ActiveBits = DemandedBits.getActiveBits();
36209 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36210 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
36211 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36212 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36213 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36214 VT.getVectorNumElements());
36215 SDValue NewC =
36216 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36217 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36218 SDValue NewOp =
36219 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36220 return TLO.CombineTo(Op, NewOp);
36221 }
36222 return false;
36223 }
36224
36225 // Only optimize Ands to prevent shrinking a constant that could be
36226 // matched by movzx.
36227 if (Opcode != ISD::AND)
36228 return false;
36229
36230 // Make sure the RHS really is a constant.
36231 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36232 if (!C)
36233 return false;
36234
36235 const APInt &Mask = C->getAPIntValue();
36236
36237 // Clear all non-demanded bits initially.
36238 APInt ShrunkMask = Mask & DemandedBits;
36239
36240 // Find the width of the shrunk mask.
36241 unsigned Width = ShrunkMask.getActiveBits();
36242
36243 // If the mask is all 0s there's nothing to do here.
36244 if (Width == 0)
36245 return false;
36246
36247 // Find the next power of 2 width, rounding up to a byte.
36248 Width = PowerOf2Ceil(std::max(Width, 8U));
36249 // Truncate the width to size to handle illegal types.
36250 Width = std::min(Width, EltSize);
36251
36252 // Calculate a possible zero extend mask for this constant.
36253 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
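// For example, with EltSize = 32, Mask = 0x1FF and DemandedBits = 0xFF:
// ShrunkMask = 0xFF, Width = 8, ZeroExtendMask = 0xFF. The new mask differs
// from Mask but is covered by Mask | ~DemandedBits, so the AND constant is
// rewritten to 0xFF, which can be matched by movzx.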
36254
36255 // If we aren't changing the mask, just return true to keep it and prevent
36256 // the caller from optimizing.
36257 if (ZeroExtendMask == Mask)
36258 return true;
36259
36260 // Make sure the new mask can be represented by a combination of mask bits
36261 // and non-demanded bits.
36262 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36263 return false;
36264
36265 // Replace the constant with the zero extend mask.
36266 SDLoc DL(Op);
36267 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36268 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36269 return TLO.CombineTo(Op, NewOp);
36270}
36271
36272void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36273 KnownBits &Known,
36274 const APInt &DemandedElts,
36275 const SelectionDAG &DAG,
36276 unsigned Depth) const {
36277 unsigned BitWidth = Known.getBitWidth();
36278 unsigned NumElts = DemandedElts.getBitWidth();
36279 unsigned Opc = Op.getOpcode();
36280 EVT VT = Op.getValueType();
36281 assert((Opc >= ISD::BUILTIN_OP_END ||
36282 Opc == ISD::INTRINSIC_WO_CHAIN ||
36283 Opc == ISD::INTRINSIC_W_CHAIN ||
36284 Opc == ISD::INTRINSIC_VOID) &&
36285 "Should use MaskedValueIsZero if you don't know whether Op"
36286 " is a target node!");
36287
36288 Known.resetAll();
36289 switch (Opc) {
36290 default: break;
36291 case X86ISD::SETCC:
36292 Known.Zero.setBitsFrom(1);
36293 break;
36294 case X86ISD::MOVMSK: {
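// MOVMSK packs one sign bit per source vector element into the low bits of
// the scalar result, so everything above NumLoBits is known zero.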
36295 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36296 Known.Zero.setBitsFrom(NumLoBits);
36297 break;
36298 }
36299 case X86ISD::PEXTRB:
36300 case X86ISD::PEXTRW: {
36301 SDValue Src = Op.getOperand(0);
36302 EVT SrcVT = Src.getValueType();
36303 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36304 Op.getConstantOperandVal(1));
36305 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36306 Known = Known.anyextOrTrunc(BitWidth);
36307 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36308 break;
36309 }
36310 case X86ISD::VSRAI:
36311 case X86ISD::VSHLI:
36312 case X86ISD::VSRLI: {
36313 unsigned ShAmt = Op.getConstantOperandVal(1);
36314 if (ShAmt >= VT.getScalarSizeInBits()) {
36315 Known.setAllZero();
36316 break;
36317 }
36318
36319 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36320 if (Opc == X86ISD::VSHLI) {
36321 Known.Zero <<= ShAmt;
36322 Known.One <<= ShAmt;
36323 // Low bits are known zero.
36324 Known.Zero.setLowBits(ShAmt);
36325 } else if (Opc == X86ISD::VSRLI) {
36326 Known.Zero.lshrInPlace(ShAmt);
36327 Known.One.lshrInPlace(ShAmt);
36328 // High bits are known zero.
36329 Known.Zero.setHighBits(ShAmt);
36330 } else {
36331 Known.Zero.ashrInPlace(ShAmt);
36332 Known.One.ashrInPlace(ShAmt);
36333 }
36334 break;
36335 }
36336 case X86ISD::PACKUS: {
36337 // PACKUS is just a truncation if the upper half is zero.
36338 APInt DemandedLHS, DemandedRHS;
36339 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36340
36341 Known.One = APInt::getAllOnes(BitWidth * 2);
36342 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36343
36344 KnownBits Known2;
36345 if (!!DemandedLHS) {
36346 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36347 Known = KnownBits::commonBits(Known, Known2);
36348 }
36349 if (!!DemandedRHS) {
36350 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36351 Known = KnownBits::commonBits(Known, Known2);
36352 }
36353
36354 if (Known.countMinLeadingZeros() < BitWidth)
36355 Known.resetAll();
36356 Known = Known.trunc(BitWidth);
36357 break;
36358 }
36359 case X86ISD::VBROADCAST: {
36360 SDValue Src = Op.getOperand(0);
36361 if (!Src.getSimpleValueType().isVector()) {
36362 Known = DAG.computeKnownBits(Src, Depth + 1);
36363 return;
36364 }
36365 break;
36366 }
36367 case X86ISD::AND: {
36368 if (Op.getResNo() == 0) {
36369 KnownBits Known2;
36370 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36371 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36372 Known &= Known2;
36373 }
36374 break;
36375 }
36376 case X86ISD::ANDNP: {
36377 KnownBits Known2;
36378 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36379 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36380
36381 // ANDNP = (~X & Y);
36382 Known.One &= Known2.Zero;
36383 Known.Zero |= Known2.One;
36384 break;
36385 }
36386 case X86ISD::FOR: {
36387 KnownBits Known2;
36388 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36389 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36390
36391 Known |= Known2;
36392 break;
36393 }
36394 case X86ISD::PSADBW: {
36395 assert(VT.getScalarType() == MVT::i64 &&
36396 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
36397 "Unexpected PSADBW types");
36398
36399 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
36400 Known.Zero.setBitsFrom(16);
36401 break;
36402 }
36403 case X86ISD::PMULUDQ: {
36404 KnownBits Known2;
36405 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36406 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36407
36408 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
36409 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
36410 Known = KnownBits::mul(Known, Known2);
36411 break;
36412 }
36413 case X86ISD::CMOV: {
36414 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
36415 // If we don't know any bits, early out.
36416 if (Known.isUnknown())
36417 break;
36418 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
36419
36420 // Only known if known in both the LHS and RHS.
36421 Known = KnownBits::commonBits(Known, Known2);
36422 break;
36423 }
36424 case X86ISD::BEXTR:
36425 case X86ISD::BEXTRI: {
36426 SDValue Op0 = Op.getOperand(0);
36427 SDValue Op1 = Op.getOperand(1);
36428
36429 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36430 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
36431 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
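// The BEXTR control operand encodes the start position in bits [7:0] and the
// extracted field length in bits [15:8].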
36432
36433 // If the length is 0, the result is 0.
36434 if (Length == 0) {
36435 Known.setAllZero();
36436 break;
36437 }
36438
36439 if ((Shift + Length) <= BitWidth) {
36440 Known = DAG.computeKnownBits(Op0, Depth + 1);
36441 Known = Known.extractBits(Length, Shift);
36442 Known = Known.zextOrTrunc(BitWidth);
36443 }
36444 }
36445 break;
36446 }
36447 case X86ISD::PDEP: {
36448 KnownBits Known2;
36449 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36450 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36451 // Zeros are retained from the mask operand. But not ones.
36452 Known.One.clearAllBits();
36453 // The result will have at least as many trailing zeros as the non-mask
36454 // operand since bits can only map to the same or higher bit position.
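// For example, a source operand known to be a multiple of 4 yields a result
// that is also a multiple of 4.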
36455 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
36456 break;
36457 }
36458 case X86ISD::PEXT: {
36459 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36460 // The result has as many leading zeros as the number of zeroes in the mask.
36461 unsigned Count = Known.Zero.countPopulation();
36462 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
36463 Known.One.clearAllBits();
36464 break;
36465 }
36466 case X86ISD::VTRUNC:
36467 case X86ISD::VTRUNCS:
36468 case X86ISD::VTRUNCUS:
36469 case X86ISD::CVTSI2P:
36470 case X86ISD::CVTUI2P:
36471 case X86ISD::CVTP2SI:
36472 case X86ISD::CVTP2UI:
36473 case X86ISD::MCVTP2SI:
36474 case X86ISD::MCVTP2UI:
36475 case X86ISD::CVTTP2SI:
36476 case X86ISD::CVTTP2UI:
36477 case X86ISD::MCVTTP2SI:
36478 case X86ISD::MCVTTP2UI:
36479 case X86ISD::MCVTSI2P:
36480 case X86ISD::MCVTUI2P:
36481 case X86ISD::VFPROUND:
36482 case X86ISD::VMFPROUND:
36483 case X86ISD::CVTPS2PH:
36484 case X86ISD::MCVTPS2PH: {
36485 // Truncations/Conversions - upper elements are known zero.
36486 EVT SrcVT = Op.getOperand(0).getValueType();
36487 if (SrcVT.isVector()) {
36488 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36489 if (NumElts > NumSrcElts &&
36490 DemandedElts.countTrailingZeros() >= NumSrcElts)
36491 Known.setAllZero();
36492 }
36493 break;
36494 }
36495 case X86ISD::STRICT_CVTTP2SI:
36496 case X86ISD::STRICT_CVTTP2UI:
36497 case X86ISD::STRICT_CVTSI2P:
36498 case X86ISD::STRICT_CVTUI2P:
36499 case X86ISD::STRICT_VFPROUND:
36500 case X86ISD::STRICT_CVTPS2PH: {
36501 // Strict Conversions - upper elements are known zero.
36502 EVT SrcVT = Op.getOperand(1).getValueType();
36503 if (SrcVT.isVector()) {
36504 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36505 if (NumElts > NumSrcElts &&
36506 DemandedElts.countTrailingZeros() >= NumSrcElts)
36507 Known.setAllZero();
36508 }
36509 break;
36510 }
36511 case X86ISD::MOVQ2DQ: {
36512 // Move from MMX to XMM. Upper half of XMM should be 0.
36513 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
36514 Known.setAllZero();
36515 break;
36516 }
36517 }
36518
36519 // Handle target shuffles.
36520 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36521 if (isTargetShuffle(Opc)) {
36522 SmallVector<int, 64> Mask;
36523 SmallVector<SDValue, 2> Ops;
36524 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36525 unsigned NumOps = Ops.size();
36526 unsigned NumElts = VT.getVectorNumElements();
36527 if (Mask.size() == NumElts) {
36528 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36529 Known.Zero.setAllBits(); Known.One.setAllBits();
36530 for (unsigned i = 0; i != NumElts; ++i) {
36531 if (!DemandedElts[i])
36532 continue;
36533 int M = Mask[i];
36534 if (M == SM_SentinelUndef) {
36535 // For UNDEF elements, we don't know anything about the common state
36536 // of the shuffle result.
36537 Known.resetAll();
36538 break;
36539 }
36540 if (M == SM_SentinelZero) {
36541 Known.One.clearAllBits();
36542 continue;
36543 }
36544 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36545 "Shuffle index out of range");
36546
36547 unsigned OpIdx = (unsigned)M / NumElts;
36548 unsigned EltIdx = (unsigned)M % NumElts;
36549 if (Ops[OpIdx].getValueType() != VT) {
36550 // TODO - handle target shuffle ops with different value types.
36551 Known.resetAll();
36552 break;
36553 }
36554 DemandedOps[OpIdx].setBit(EltIdx);
36555 }
36556 // Known bits are the values that are shared by every demanded element.
36557 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
36558 if (!DemandedOps[i])
36559 continue;
36560 KnownBits Known2 =
36561 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
36562 Known = KnownBits::commonBits(Known, Known2);
36563 }
36564 }
36565 }
36566 }
36567}
36568
36569unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
36570 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
36571 unsigned Depth) const {
36572 EVT VT = Op.getValueType();
36573 unsigned VTBits = VT.getScalarSizeInBits();
36574 unsigned Opcode = Op.getOpcode();
36575 switch (Opcode) {
36576 case X86ISD::SETCC_CARRY:
36577 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
36578 return VTBits;
36579
36580 case X86ISD::VTRUNC: {
36581 SDValue Src = Op.getOperand(0);
36582 MVT SrcVT = Src.getSimpleValueType();
36583 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
36584 assert(VTBits < NumSrcBits && "Illegal truncation input type");
36585 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36586 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
36587 if (Tmp > (NumSrcBits - VTBits))
36588 return Tmp - (NumSrcBits - VTBits);
36589 return 1;
36590 }
36591
36592 case X86ISD::PACKSS: {
36593 // PACKSS is just a truncation if the sign bits extend to the packed size.
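// For example, when packing i16 elements down to i8, a source with 12 known
// sign bits yields a result with 12 - 8 = 4 known sign bits, while 8 or fewer
// known sign bits gives no guarantee beyond the minimum of 1.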
36594 APInt DemandedLHS, DemandedRHS;
36595 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
36596 DemandedRHS);
36597
36598 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
36599 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
36600 if (!!DemandedLHS)
36601 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36602 if (!!DemandedRHS)
36603 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36604 unsigned Tmp = std::min(Tmp0, Tmp1);
36605 if (Tmp > (SrcBits - VTBits))
36606 return Tmp - (SrcBits - VTBits);
36607 return 1;
36608 }
36609
36610 case X86ISD::VBROADCAST: {
36611 SDValue Src = Op.getOperand(0);
36612 if (!Src.getSimpleValueType().isVector())
36613 return DAG.ComputeNumSignBits(Src, Depth + 1);
36614 break;
36615 }
36616
36617 case X86ISD::VSHLI: {
36618 SDValue Src = Op.getOperand(0);
36619 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
36620 if (ShiftVal.uge(VTBits))
36621 return VTBits; // Shifted all bits out --> zero.
36622 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36623 if (ShiftVal.uge(Tmp))
36624 return 1; // Shifted all sign bits out --> unknown.
36625 return Tmp - ShiftVal.getZExtValue();
36626 }
36627
36628 case X86ISD::VSRAI: {
36629 SDValue Src = Op.getOperand(0);
36630 APInt ShiftVal = Op.getConstantOperandAPInt(1);
36631 if (ShiftVal.uge(VTBits - 1))
36632 return VTBits; // Sign splat.
36633 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36634 ShiftVal += Tmp;
36635 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
36636 }
36637
36638 case X86ISD::FSETCC:
36639 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
36640 if (VT == MVT::f32 || VT == MVT::f64 ||
36641 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
36642 return VTBits;
36643 break;
36644
36645 case X86ISD::PCMPGT:
36646 case X86ISD::PCMPEQ:
36647 case X86ISD::CMPP:
36648 case X86ISD::VPCOM:
36649 case X86ISD::VPCOMU:
36650 // Vector compares return zero/all-bits result values.
36651 return VTBits;
36652
36653 case X86ISD::ANDNP: {
36654 unsigned Tmp0 =
36655 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
36656 if (Tmp0 == 1) return 1; // Early out.
36657 unsigned Tmp1 =
36658 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
36659 return std::min(Tmp0, Tmp1);
36660 }
36661
36662 case X86ISD::CMOV: {
36663 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
36664 if (Tmp0 == 1) return 1; // Early out.
36665 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
36666 return std::min(Tmp0, Tmp1);
36667 }
36668 }
36669
36670 // Handle target shuffles.
36671 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36672 if (isTargetShuffle(Opcode)) {
36673 SmallVector<int, 64> Mask;
36674 SmallVector<SDValue, 2> Ops;
36675 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36676 unsigned NumOps = Ops.size();
36677 unsigned NumElts = VT.getVectorNumElements();
36678 if (Mask.size() == NumElts) {
36679 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36680 for (unsigned i = 0; i != NumElts; ++i) {
36681 if (!DemandedElts[i])
36682 continue;
36683 int M = Mask[i];
36684 if (M == SM_SentinelUndef) {
36685 // For UNDEF elements, we don't know anything about the common state
36686 // of the shuffle result.
36687 return 1;
36688 } else if (M == SM_SentinelZero) {
36689 // Zero = all sign bits.
36690 continue;
36691 }
36692 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36693 "Shuffle index out of range");
36694
36695 unsigned OpIdx = (unsigned)M / NumElts;
36696 unsigned EltIdx = (unsigned)M % NumElts;
36697 if (Ops[OpIdx].getValueType() != VT) {
36698 // TODO - handle target shuffle ops with different value types.
36699 return 1;
36700 }
36701 DemandedOps[OpIdx].setBit(EltIdx);
36702 }
36703 unsigned Tmp0 = VTBits;
36704 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
36705 if (!DemandedOps[i])
36706 continue;
36707 unsigned Tmp1 =
36708 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
36709 Tmp0 = std::min(Tmp0, Tmp1);
36710 }
36711 return Tmp0;
36712 }
36713 }
36714 }
36715
36716 // Fallback case.
36717 return 1;
36718}
36719
36720SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
36721 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
36722 return N->getOperand(0);
36723 return N;
36724}
36725
36726// Helper to look for a normal load that can be narrowed into a vzload with the
36727// specified VT and memory VT. Returns SDValue() on failure.
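// (A vzload, X86ISD::VZEXT_LOAD, loads only MemVT worth of data and
// zero-fills the remaining vector elements.)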
36728static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
36729 SelectionDAG &DAG) {
36730 // Can't if the load is volatile or atomic.
36731 if (!LN->isSimple())
36732 return SDValue();
36733
36734 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36735 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36736 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
36737 LN->getPointerInfo(), LN->getOriginalAlign(),
36738 LN->getMemOperand()->getFlags());
36739}
36740
36741// Attempt to match a combined shuffle mask against supported unary shuffle
36742// instructions.
36743// TODO: Investigate sharing more of this with shuffle lowering.
36744static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36745 bool AllowFloatDomain, bool AllowIntDomain,
36746 SDValue V1, const X86Subtarget &Subtarget,
36747 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
36748 unsigned NumMaskElts = Mask.size();
36749 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
36750
36751 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
36752 if (Mask[0] == 0 &&
36753 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
36754 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
36755 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36756 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
36757 Shuffle = X86ISD::VZEXT_MOVL;
36758 if (MaskEltSize == 16)
36759 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36760 else
36761 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36762 return true;
36763 }
36764 }
36765
36766 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
36767 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
36768 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
36769 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
36770 unsigned MaxScale = 64 / MaskEltSize;
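// For example, a v16i8 mask tries Scale = 2, 4 and 8, i.e. extension to i16,
// i32 and i64 elements respectively.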
36771 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
36772 bool MatchAny = true;
36773 bool MatchZero = true;
36774 unsigned NumDstElts = NumMaskElts / Scale;
36775 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
36776 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
36777 MatchAny = MatchZero = false;
36778 break;
36779 }
36780 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
36781 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
36782 }
36783 if (MatchAny || MatchZero) {
36784 assert(MatchZero && "Failed to match zext but matched aext?");
36785 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
36786 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
36787 MVT::getIntegerVT(MaskEltSize);
36788 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
36789
36790 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
36791 if (SrcVT.getVectorNumElements() != NumDstElts)
36792 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
36793
36794 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
36795 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
36796 return true;
36797 }
36798 }
36799 }
36800
36801 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
36802 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
36803 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
36804 isUndefOrEqual(Mask[0], 0) &&
36805 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
36806 Shuffle = X86ISD::VZEXT_MOVL;
36807 if (MaskEltSize == 16)
36808 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36809 else
36810 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36811 return true;
36812 }
36813
36814 // Check if we have SSE3, which will let us use MOVDDUP etc. These
36815 // instructions are no slower than UNPCKLPD but have the option to
36816 // fold the input operand into even an unaligned memory load.
36817 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
36818 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
36819 Shuffle = X86ISD::MOVDDUP;
36820 SrcVT = DstVT = MVT::v2f64;
36821 return true;
36822 }
36823 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36824 Shuffle = X86ISD::MOVSLDUP;
36825 SrcVT = DstVT = MVT::v4f32;
36826 return true;
36827 }
36828 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
36829 Shuffle = X86ISD::MOVSHDUP;
36830 SrcVT = DstVT = MVT::v4f32;
36831 return true;
36832 }
36833 }
36834
36835 if (MaskVT.is256BitVector() && AllowFloatDomain) {
36836 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
36837 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36838 Shuffle = X86ISD::MOVDDUP;
36839 SrcVT = DstVT = MVT::v4f64;
36840 return true;
36841 }
36842 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36843 Shuffle = X86ISD::MOVSLDUP;
36844 SrcVT = DstVT = MVT::v8f32;
36845 return true;
36846 }
36847 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
36848 Shuffle = X86ISD::MOVSHDUP;
36849 SrcVT = DstVT = MVT::v8f32;
36850 return true;
36851 }
36852 }
36853
36854 if (MaskVT.is512BitVector() && AllowFloatDomain) {
36855 assert(Subtarget.hasAVX512() &&
36856 "AVX512 required for 512-bit vector shuffles");
36857 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36858 Shuffle = X86ISD::MOVDDUP;
36859 SrcVT = DstVT = MVT::v8f64;
36860 return true;
36861 }
36862 if (isTargetShuffleEquivalent(
36863 MaskVT, Mask,
36864 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
36865 Shuffle = X86ISD::MOVSLDUP;
36866 SrcVT = DstVT = MVT::v16f32;
36867 return true;
36868 }
36869 if (isTargetShuffleEquivalent(
36870 MaskVT, Mask,
36871 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
36872 Shuffle = X86ISD::MOVSHDUP;
36873 SrcVT = DstVT = MVT::v16f32;
36874 return true;
36875 }
36876 }
36877
36878 return false;
36879}
36880
36881// Attempt to match a combined shuffle mask against supported unary immediate
36882// permute instructions.
36883// TODO: Investigate sharing more of this with shuffle lowering.
36884static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
36885 const APInt &Zeroable,
36886 bool AllowFloatDomain, bool AllowIntDomain,
36887 const X86Subtarget &Subtarget,
36888 unsigned &Shuffle, MVT &ShuffleVT,
36889 unsigned &PermuteImm) {
36890 unsigned NumMaskElts = Mask.size();
36891 unsigned InputSizeInBits = MaskVT.getSizeInBits();
36892 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
36893 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
36894 bool ContainsZeros = isAnyZero(Mask);
36895
36896 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
36897 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
36898 // Check for lane crossing permutes.
36899 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
36900 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
36901 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
36902 Shuffle = X86ISD::VPERMI;
36903 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
36904 PermuteImm = getV4X86ShuffleImm(Mask);
36905 return true;
36906 }
36907 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
36908 SmallVector<int, 4> RepeatedMask;
36909 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
36910 Shuffle = X86ISD::VPERMI;
36911 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
36912 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
36913 return true;
36914 }
36915 }
36916 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
36917 // VPERMILPD can permute with a non-repeating shuffle.
36918 Shuffle = X86ISD::VPERMILPI;
36919 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
36920 PermuteImm = 0;
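// For example, a v4f64 mask <1,0,3,2> yields PermuteImm = 0b0101: bit i is
// set when result element i takes the odd element of its 128-bit pair.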
36921 for (int i = 0, e = Mask.size(); i != e; ++i) {
36922 int M = Mask[i];
36923 if (M == SM_SentinelUndef)
36924 continue;
36925 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
36926 PermuteImm |= (M & 1) << i;
36927 }
36928 return true;
36929 }
36930 }
36931
36932 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
36933 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
36934 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
36935 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
36936 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
36937 SmallVector<int, 4> RepeatedMask;
36938 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36939 // Narrow the repeated mask to create 32-bit element permutes.
36940 SmallVector<int, 4> WordMask = RepeatedMask;
36941 if (MaskScalarSizeInBits == 64)
36942 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
36943
36944 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
36945 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
36946 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
36947 PermuteImm = getV4X86ShuffleImm(WordMask);
36948 return true;
36949 }
36950 }
36951
36952 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
36953 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
36954 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36955 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36956 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36957 SmallVector<int, 4> RepeatedMask;
36958 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36959 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
36960 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
36961
36962 // PSHUFLW: permute lower 4 elements only.
36963 if (isUndefOrInRange(LoMask, 0, 4) &&
36964 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
36965 Shuffle = X86ISD::PSHUFLW;
36966 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36967 PermuteImm = getV4X86ShuffleImm(LoMask);
36968 return true;
36969 }
36970
36971 // PSHUFHW: permute upper 4 elements only.
36972 if (isUndefOrInRange(HiMask, 4, 8) &&
36973 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
36974 // Offset the HiMask so that we can create the shuffle immediate.
36975 int OffsetHiMask[4];
36976 for (int i = 0; i != 4; ++i)
36977 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
36978
36979 Shuffle = X86ISD::PSHUFHW;
36980 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36981 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
36982 return true;
36983 }
36984 }
36985 }
36986
36987 // Attempt to match against byte/bit shifts.
36988 if (AllowIntDomain &&
36989 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36990 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36991 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36992 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
36993 Mask, 0, Zeroable, Subtarget);
36994 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
36995 32 <= ShuffleVT.getScalarSizeInBits())) {
36996 PermuteImm = (unsigned)ShiftAmt;
36997 return true;
36998 }
36999 }
37000
37001 // Attempt to match against bit rotates.
37002 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37003 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37004 Subtarget.hasAVX512())) {
37005 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37006 Subtarget, Mask);
37007 if (0 < RotateAmt) {
37008 Shuffle = X86ISD::VROTLI;
37009 PermuteImm = (unsigned)RotateAmt;
37010 return true;
37011 }
37012 }
37013
37014 return false;
37015}
37016
37017// Attempt to match a combined unary shuffle mask against supported binary
37018// shuffle instructions.
37019// TODO: Investigate sharing more of this with shuffle lowering.
37020static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37021 bool AllowFloatDomain, bool AllowIntDomain,
37022 SDValue &V1, SDValue &V2, const SDLoc &DL,
37023 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37024 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37025 bool IsUnary) {
37026 unsigned NumMaskElts = Mask.size();
37027 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37028
37029 if (MaskVT.is128BitVector()) {
37030 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
37031 V2 = V1;
37032 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37033 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37034 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37035 return true;
37036 }
37037 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
37038 V2 = V1;
37039 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37040 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37041 return true;
37042 }
37043 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
37044 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37045 std::swap(V1, V2);
37046 Shuffle = X86ISD::MOVSD;
37047 SrcVT = DstVT = MVT::v2f64;
37048 return true;
37049 }
37050 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
37051 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37052 Shuffle = X86ISD::MOVSS;
37053 SrcVT = DstVT = MVT::v4f32;
37054 return true;
37055 }
37056 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
37057 Subtarget.hasFP16()) {
37058 Shuffle = X86ISD::MOVSH;
37059 SrcVT = DstVT = MVT::v8f16;
37060 return true;
37061 }
37062 }
37063
37064 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
37065 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37066 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37067 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37068 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37069 Subtarget)) {
37070 DstVT = MaskVT;
37071 return true;
37072 }
37073 }
37074
37075 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37076 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37077 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37078 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37079 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37080 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
37081 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37082 Subtarget)) {
37083 SrcVT = DstVT = MaskVT;
37084 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37085 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37086 return true;
37087 }
37088 }
37089
37090 // Attempt to match against an OR if we're performing a blend shuffle and the
37091 // non-blended source element is zero in each case.
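// For example, blending <a,0,c,0> with <0,b,0,d> can be lowered as a plain
// bitwise OR of the two vectors (in the integer domain).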
37092 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37093 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37094 bool IsBlend = true;
37095 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37096 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37097 unsigned Scale1 = NumV1Elts / NumMaskElts;
37098 unsigned Scale2 = NumV2Elts / NumMaskElts;
37099 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37100 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37101 for (unsigned i = 0; i != NumMaskElts; ++i) {
37102 int M = Mask[i];
37103 if (M == SM_SentinelUndef)
37104 continue;
37105 if (M == SM_SentinelZero) {
37106 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37107 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37108 continue;
37109 }
37110 if (M == (int)i) {
37111 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37112 continue;
37113 }
37114 if (M == (int)(i + NumMaskElts)) {
37115 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37116 continue;
37117 }
37118 IsBlend = false;
37119 break;
37120 }
37121 if (IsBlend) {
37122 if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
37123 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
37124 Shuffle = ISD::OR;
37125 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37126 return true;
37127 }
37128 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37129 // FIXME: handle mismatched sizes?
37130 // TODO: investigate if `ISD::OR` handling in
37131 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37132 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37133 unsigned NumElts = V.getValueType().getVectorNumElements();
37134 KnownBits Known(NumElts);
37135 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37136 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37137 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37138 if (PeepholeKnown.isZero())
37139 Known.Zero.setBit(EltIdx);
37140 if (PeepholeKnown.isAllOnes())
37141 Known.One.setBit(EltIdx);
37142 }
37143 return Known;
37144 };
37145
37146 KnownBits V1Known = computeKnownBitsElementWise(V1);
37147 KnownBits V2Known = computeKnownBitsElementWise(V2);
37148
37149 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37150 int M = Mask[i];
37151 if (M == SM_SentinelUndef)
37152 continue;
37153 if (M == SM_SentinelZero) {
37154 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37155 continue;
37156 }
37157 if (M == (int)i) {
37158 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37159 continue;
37160 }
37161 if (M == (int)(i + NumMaskElts)) {
37162 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37163 continue;
37164 }
37165 llvm_unreachable("will not get here.");
37166 }
37167 if (IsBlend) {
37168 Shuffle = ISD::OR;
37169 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37170 return true;
37171 }
37172 }
37173 }
37174 }
37175
37176 return false;
37177}
37178
37179static bool matchBinaryPermuteShuffle(
37180 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37181 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37182 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37183 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37184 unsigned NumMaskElts = Mask.size();
37185 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37186
37187 // Attempt to match against VALIGND/VALIGNQ rotate.
37188 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37189 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37190 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37191 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37192 if (!isAnyZero(Mask)) {
37193 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37194 if (0 < Rotation) {
37195 Shuffle = X86ISD::VALIGN;
37196 if (EltSizeInBits == 64)
37197 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37198 else
37199 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37200 PermuteImm = Rotation;
37201 return true;
37202 }
37203 }
37204 }
37205
37206 // Attempt to match against PALIGNR byte rotate.
37207 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37208 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37209 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37210 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
37211 if (0 < ByteRotation) {
37212 Shuffle = X86ISD::PALIGNR;
37213 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
37214 PermuteImm = ByteRotation;
37215 return true;
37216 }
37217 }
37218
37219 // Attempt to combine to X86ISD::BLENDI.
37220 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
37221 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
37222 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
37223 uint64_t BlendMask = 0;
37224 bool ForceV1Zero = false, ForceV2Zero = false;
37225 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
37226 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
37227 ForceV2Zero, BlendMask)) {
37228 if (MaskVT == MVT::v16i16) {
37229 // We can only use v16i16 PBLENDW if the lanes are repeated.
37230 SmallVector<int, 8> RepeatedMask;
37231 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
37232 RepeatedMask)) {
37233 assert(RepeatedMask.size() == 8 &&
37234 "Repeated mask size doesn't match!");
37235 PermuteImm = 0;
37236 for (int i = 0; i < 8; ++i)
37237 if (RepeatedMask[i] >= 8)
37238 PermuteImm |= 1 << i;
37239 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37240 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37241 Shuffle = X86ISD::BLENDI;
37242 ShuffleVT = MaskVT;
37243 return true;
37244 }
37245 } else {
37246 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37247 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37248 PermuteImm = (unsigned)BlendMask;
37249 Shuffle = X86ISD::BLENDI;
37250 ShuffleVT = MaskVT;
37251 return true;
37252 }
37253 }
37254 }
37255
37256 // Attempt to combine to INSERTPS, but only if it has elements that need to
37257 // be set to zero.
37258 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37259 MaskVT.is128BitVector() && isAnyZero(Mask) &&
37260 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37261 Shuffle = X86ISD::INSERTPS;
37262 ShuffleVT = MVT::v4f32;
37263 return true;
37264 }
37265
37266 // Attempt to combine to SHUFPD.
37267 if (AllowFloatDomain && EltSizeInBits == 64 &&
37268 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37269 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37270 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37271 bool ForceV1Zero = false, ForceV2Zero = false;
37272 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
37273 PermuteImm, Mask, Zeroable)) {
37274 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37275 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37276 Shuffle = X86ISD::SHUFP;
37277 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
37278 return true;
37279 }
37280 }
37281
37282 // Attempt to combine to SHUFPS.
37283 if (AllowFloatDomain && EltSizeInBits == 32 &&
37284 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
37285 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37286 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37287 SmallVector<int, 4> RepeatedMask;
37288 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
37289 // Match each half of the repeated mask, to determine if it's just
37290 // referencing one of the vectors, is zeroable or entirely undef.
37291 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
37292 int M0 = RepeatedMask[Offset];
37293 int M1 = RepeatedMask[Offset + 1];
37294
37295 if (isUndefInRange(RepeatedMask, Offset, 2)) {
37296 return DAG.getUNDEF(MaskVT);
37297 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
37298 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
37299 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
37300 return getZeroVector(MaskVT, Subtarget, DAG, DL);
37301 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
37302 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37303 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37304 return V1;
37305 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
37306 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37307 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37308 return V2;
37309 }
37310
37311 return SDValue();
37312 };
37313
37314 int ShufMask[4] = {-1, -1, -1, -1};
37315 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
37316 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
37317
37318 if (Lo && Hi) {
37319 V1 = Lo;
37320 V2 = Hi;
37321 Shuffle = X86ISD::SHUFP;
37322 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
37323 PermuteImm = getV4X86ShuffleImm(ShufMask);
37324 return true;
37325 }
37326 }
37327 }
37328
37329 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
37330 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37331 MaskVT.is128BitVector() &&
37332 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37333 Shuffle = X86ISD::INSERTPS;
37334 ShuffleVT = MVT::v4f32;
37335 return true;
37336 }
37337
37338 return false;
37339}
37340
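matchBinaryPermuteShuffle encodes its result into small immediates: the BLENDI path packs one bit per repeated-mask element (set when that element comes from the second source), and the SHUFPS/SHUFPD paths pack two selection bits per element (see getV4X86ShuffleImm above). A self-contained sketch of both packings, assuming sentinel-free inputs; these helper names are illustrative, not the LLVM API:

    #include <cstdint>
    #include <vector>

    // PBLENDW/BLENDI-style immediate: bit i is set when element i is taken
    // from the second source (repeated-mask values 8..15); undef (negative)
    // entries and values 0..7 select the first source.
    uint8_t buildBlendImmediate(const std::vector<int> &RepeatedMask) {
      uint8_t Imm = 0;
      for (int i = 0; i < 8 && i < (int)RepeatedMask.size(); ++i)
        if (RepeatedMask[i] >= 8)
          Imm |= 1u << i;
      return Imm; // e.g. {0,9,2,11,4,13,6,15} -> 0xAA
    }

    // SHUFPS-style immediate: two bits per output element, each selecting one
    // of four source positions; undef entries (-1) default to position 0.
    uint8_t buildV4ShuffleImmediate(const int (&Sel)[4]) {
      uint8_t Imm = 0;
      for (int i = 0; i != 4; ++i)
        Imm |= (uint8_t)((Sel[i] < 0 ? 0 : (Sel[i] & 3)) << (2 * i));
      return Imm; // e.g. {3,2,1,0} -> 0x1B
    }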
37341static SDValue combineX86ShuffleChainWithExtract(
37342 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37343 bool HasVariableMask, bool AllowVariableCrossLaneMask,
37344 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37345 const X86Subtarget &Subtarget);
37346
37347/// Combine an arbitrary chain of shuffles into a single instruction if
37348/// possible.
37349///
37350/// This is the leaf of the recursive combine below. When we have found some
37351/// chain of single-use x86 shuffle instructions and accumulated the combined
37352/// shuffle mask represented by them, this will try to pattern match that mask
37353/// into either a single instruction if there is a special purpose instruction
37354/// for this operation, or into a PSHUFB instruction which is a fully general
37355/// instruction but should only be used to replace chains over a certain depth.
37356static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
37357 ArrayRef<int> BaseMask, int Depth,
37358 bool HasVariableMask,
37359 bool AllowVariableCrossLaneMask,
37360 bool AllowVariablePerLaneMask,
37361 SelectionDAG &DAG,
37362 const X86Subtarget &Subtarget) {
37363 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
37364 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
37365 "Unexpected number of shuffle inputs!");
37366
37367 SDLoc DL(Root);
37368 MVT RootVT = Root.getSimpleValueType();
37369 unsigned RootSizeInBits = RootVT.getSizeInBits();
37370 unsigned NumRootElts = RootVT.getVectorNumElements();
37371
37372 // Canonicalize shuffle input op to the requested type.
37373 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
37374 if (VT.getSizeInBits() > Op.getValueSizeInBits())
37375 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
37376 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
37377 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
37378 return DAG.getBitcast(VT, Op);
37379 };
37380
37381 // Find the inputs that enter the chain. Note that multiple uses are OK
37382 // here; we're not going to remove the operands we find.
37383 bool UnaryShuffle = (Inputs.size() == 1);
37384 SDValue V1 = peekThroughBitcasts(Inputs[0]);
37385 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
37386 : peekThroughBitcasts(Inputs[1]));
37387
37388 MVT VT1 = V1.getSimpleValueType();
37389 MVT VT2 = V2.getSimpleValueType();
37390 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
37391 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
37392
37393 SDValue Res;
37394
37395 unsigned NumBaseMaskElts = BaseMask.size();
37396 if (NumBaseMaskElts == 1) {
37397 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
37398 return CanonicalizeShuffleInput(RootVT, V1);
37399 }
37400
37401 bool OptForSize = DAG.shouldOptForSize();
37402 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
37403 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
37404 (RootVT.isFloatingPoint() && Depth >= 1) ||
37405 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
37406
37407 // Don't combine if we are a AVX512/EVEX target and the mask element size
37408 // is different from the root element size - this would prevent writemasks
37409 // from being reused.
37410 bool IsMaskedShuffle = false;
37411 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
37412 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
37413 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
37414 IsMaskedShuffle = true;
37415 }
37416 }
37417
37418 // If we are shuffling a broadcast (and not introducing zeros) then
37419 // we can just use the broadcast directly. This works for smaller broadcast
37420 // elements as well, as they already repeat across each mask element.
37421 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
37422 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37423 V1.getValueSizeInBits() >= RootSizeInBits) {
37424 return CanonicalizeShuffleInput(RootVT, V1);
37425 }
37426
37427 SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
37428
37429 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
37430 // etc. can be simplified.
37431 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
37432 SmallVector<int> ScaledMask, IdentityMask;
37433 unsigned NumElts = VT1.getVectorNumElements();
37434 if (Mask.size() <= NumElts &&
37435 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
37436 for (unsigned i = 0; i != NumElts; ++i)
37437 IdentityMask.push_back(i);
37438 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
37439 return CanonicalizeShuffleInput(RootVT, V1);
37440 }
37441 }
37442
37443 // Handle 128/256-bit lane shuffles of 512-bit vectors.
37444 if (RootVT.is512BitVector() &&
37445 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
37446 // If the upper subvectors are zeroable, then an extract+insert is more
37447 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
37448 // to zero the upper subvectors.
37449 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
37450 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37451 return SDValue(); // Nothing to do!
37452 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
37453 "Unexpected lane shuffle");
37454 Res = CanonicalizeShuffleInput(RootVT, V1);
37455 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
37456 bool UseZero = isAnyZero(Mask);
37457 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
37458 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
37459 }
37460
37461 // Narrow shuffle mask to v4x128.
37462 SmallVector<int, 4> ScaledMask;
37463 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
37464 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
37465
37466 // Try to lower to vshuf64x2/vshuf32x4.
37467 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
37468 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
37469 SelectionDAG &DAG) {
37470 unsigned PermMask = 0;
37471 // Ensure elements came from the same Op.
37472 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
37473 for (int i = 0; i < 4; ++i) {
37474 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
37475 if (ScaledMask[i] < 0)
37476 continue;
37477
37478 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
37479 unsigned OpIndex = i / 2;
37480 if (Ops[OpIndex].isUndef())
37481 Ops[OpIndex] = Op;
37482 else if (Ops[OpIndex] != Op)
37483 return SDValue();
37484
37485 // Convert the 128-bit shuffle mask selection values into 128-bit
37486 // selection bits defined by a vshuf64x2 instruction's immediate control
37487 // byte.
37488 PermMask |= (ScaledMask[i] % 4) << (i * 2);
37489 }
37490
37491 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
37492 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
37493 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
37494 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37495 };
37496
37497 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
37498 // doesn't work because our mask is for 128 bits and we don't have an MVT
37499 // to match that.
37500 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
37501 isUndefOrInRange(ScaledMask[1], 0, 2) &&
37502 isUndefOrInRange(ScaledMask[2], 2, 4) &&
37503 isUndefOrInRange(ScaledMask[3], 2, 4) &&
37504 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
37505 ScaledMask[0] == (ScaledMask[2] % 2)) &&
37506 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
37507 ScaledMask[1] == (ScaledMask[3] % 2));
37508
37509 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
37510 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37511 return SDValue(); // Nothing to do!
37512 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
37513 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
37514 return DAG.getBitcast(RootVT, V);
37515 }
37516 }
37517
37518 // Handle 128-bit lane shuffles of 256-bit vectors.
37519 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
37520 // If the upper half is zeroable, then an extract+insert is more optimal
37521 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
37522 // zero the upper half.
37523 if (isUndefOrZero(Mask[1])) {
37524 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37525 return SDValue(); // Nothing to do!
37526 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
37527 Res = CanonicalizeShuffleInput(RootVT, V1);
37528 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
37529 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
37530 256);
37531 }
37532
37533 // If we're inserting the low subvector, an insert-subvector 'concat'
37534 // pattern is quicker than VPERM2X128.
37535 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
37536 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
37537 !Subtarget.hasAVX2()) {
37538 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37539 return SDValue(); // Nothing to do!
37540 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
37541 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
37542 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
37543 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
37544 }
37545
37546 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
37547 return SDValue(); // Nothing to do!
37548
37549 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
37550 // we need to use the zeroing feature.
37551 // Prefer blends for sequential shuffles unless we are optimizing for size.
37552 if (UnaryShuffle &&
37553 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
37554 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
37555 unsigned PermMask = 0;
37556 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
37557 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
37558 return DAG.getNode(
37559 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
37560 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
37561 }
37562
37563 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37564 return SDValue(); // Nothing to do!
37565
37566 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
37567 if (!UnaryShuffle && !IsMaskedShuffle) {
37568 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
37569 "Unexpected shuffle sentinel value");
37570 // Prefer blends to X86ISD::VPERM2X128.
37571 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
37572 unsigned PermMask = 0;
37573 PermMask |= ((Mask[0] & 3) << 0);
37574 PermMask |= ((Mask[1] & 3) << 4);
37575 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
37576 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
37577 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
37578 CanonicalizeShuffleInput(RootVT, LHS),
37579 CanonicalizeShuffleInput(RootVT, RHS),
37580 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37581 }
37582 }
37583 }
37584
37585 // For masks that have been widened to 128-bit elements or more,
37586 // narrow back down to 64-bit elements.
37587 if (BaseMaskEltSizeInBits > 64) {
37588 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
37589 int MaskScale = BaseMaskEltSizeInBits / 64;
37590 SmallVector<int, 64> ScaledMask;
37591 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37592 Mask = std::move(ScaledMask);
37593 }
37594
37595 // For masked shuffles, we're trying to match the root width for better
37596 // writemask folding, attempt to scale the mask.
37597 // TODO - variable shuffles might need this to be widened again.
37598 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
37599 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
37600 int MaskScale = NumRootElts / Mask.size();
37601 SmallVector<int, 64> ScaledMask;
37602 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37603 Mask = std::move(ScaledMask);
37604 }
37605
37606 unsigned NumMaskElts = Mask.size();
37607 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
37608
37609 // Determine the effective mask value type.
37610 FloatDomain &= (32 <= MaskEltSizeInBits);
37611 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
37612 : MVT::getIntegerVT(MaskEltSizeInBits);
37613 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
37614
37615 // Only allow legal mask types.
37616 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37617 return SDValue();
37618
37619 // Attempt to match the mask against known shuffle patterns.
37620 MVT ShuffleSrcVT, ShuffleVT;
37621 unsigned Shuffle, PermuteImm;
37622
37623 // Which shuffle domains are permitted?
37624 // Permit domain crossing at higher combine depths.
37625 // TODO: Should we indicate which domain is preferred if both are allowed?
37626 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
37627 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
37628 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
37629
37630 // Determine zeroable mask elements.
37631 APInt KnownUndef, KnownZero;
37632 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
37633 APInt Zeroable = KnownUndef | KnownZero;
37634
37635 if (UnaryShuffle) {
37636 // Attempt to match against broadcast-from-vector.
37637 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
37638 if ((Subtarget.hasAVX2() ||
37639 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
37640 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
37641 if (isUndefOrEqual(Mask, 0)) {
37642 if (V1.getValueType() == MaskVT &&
37643 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37644 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
37645 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37646 return SDValue(); // Nothing to do!
37647 Res = V1.getOperand(0);
37648 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37649 return DAG.getBitcast(RootVT, Res);
37650 }
37651 if (Subtarget.hasAVX2()) {
37652 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37653 return SDValue(); // Nothing to do!
37654 Res = CanonicalizeShuffleInput(MaskVT, V1);
37655 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37656 return DAG.getBitcast(RootVT, Res);
37657 }
37658 }
37659 }
37660
37661 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
37662 Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
37663 (!IsMaskedShuffle ||
37664 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37665 if (Depth == 0 && Root.getOpcode() == Shuffle)
37666 return SDValue(); // Nothing to do!
37667 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37668 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
37669 return DAG.getBitcast(RootVT, Res);
37670 }
37671
37672 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37673 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
37674 PermuteImm) &&
37675 (!IsMaskedShuffle ||
37676 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37677 if (Depth == 0 && Root.getOpcode() == Shuffle)
37678 return SDValue(); // Nothing to do!
37679 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
37680 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
37681 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37682 return DAG.getBitcast(RootVT, Res);
37683 }
37684 }
37685
37686 // Attempt to combine to INSERTPS, but only if the inserted element has come
37687 // from a scalar.
37688 // TODO: Handle other insertions here as well?
37689 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
37690 Subtarget.hasSSE41() &&
37691 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
37692 if (MaskEltSizeInBits == 32) {
37693 SDValue SrcV1 = V1, SrcV2 = V2;
37694 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
37695 DAG) &&
37696 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37697 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37698 return SDValue(); // Nothing to do!
37699 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37700 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
37701 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
37702 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37703 return DAG.getBitcast(RootVT, Res);
37704 }
37705 }
37706 if (MaskEltSizeInBits == 64 &&
37707 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
37708 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37709 V2.getScalarValueSizeInBits() <= 32) {
37710 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37711 return SDValue(); // Nothing to do!
37712 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
37713 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37714 CanonicalizeShuffleInput(MVT::v4f32, V1),
37715 CanonicalizeShuffleInput(MVT::v4f32, V2),
37716 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37717 return DAG.getBitcast(RootVT, Res);
37718 }
37719 }
37720
37721 SDValue NewV1 = V1; // Save operands in case early exit happens.
37722 SDValue NewV2 = V2;
37723 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
37724 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
37725 ShuffleVT, UnaryShuffle) &&
37726 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37727 if (Depth == 0 && Root.getOpcode() == Shuffle)
37728 return SDValue(); // Nothing to do!
37729 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
37730 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
37731 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
37732 return DAG.getBitcast(RootVT, Res);
37733 }
37734
37735 NewV1 = V1; // Save operands in case early exit happens.
37736 NewV2 = V2;
37737 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37738 AllowIntDomain, NewV1, NewV2, DL, DAG,
37739 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
37740 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37741 if (Depth == 0 && Root.getOpcode() == Shuffle)
37742 return SDValue(); // Nothing to do!
37743 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
37744 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
37745 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
37746 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37747 return DAG.getBitcast(RootVT, Res);
37748 }
37749
37750 // Typically from here on, we need an integer version of MaskVT.
37751 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
37752 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
37753
37754 // Annoyingly, SSE4A instructions don't map into the above match helpers.
37755 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
37756 uint64_t BitLen, BitIdx;
37757 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
37758 Zeroable)) {
37759 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
37760 return SDValue(); // Nothing to do!
37761 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37762 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
37763 DAG.getTargetConstant(BitLen, DL, MVT::i8),
37764 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37765 return DAG.getBitcast(RootVT, Res);
37766 }
37767
37768 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
37769 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
37770 return SDValue(); // Nothing to do!
37771 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37772 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
37773 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
37774 DAG.getTargetConstant(BitLen, DL, MVT::i8),
37775 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37776 return DAG.getBitcast(RootVT, Res);
37777 }
37778 }
37779
37780 // Match shuffle against TRUNCATE patterns.
37781 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
37782 // Match against a VTRUNC instruction, accounting for src/dst sizes.
37783 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
37784 Subtarget)) {
37785 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
37786 ShuffleSrcVT.getVectorNumElements();
37787 unsigned Opc =
37788 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
37789 if (Depth == 0 && Root.getOpcode() == Opc)
37790 return SDValue(); // Nothing to do!
37791 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37792 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
37793 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
37794 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
37795 return DAG.getBitcast(RootVT, Res);
37796 }
37797
37798 // Do we need a more general binary truncation pattern?
37799 if (RootSizeInBits < 512 &&
37800 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
37801 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
37802 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
37803 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
37804 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
37805 return SDValue(); // Nothing to do!
37806 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37807 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
37808 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37809 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
37810 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37811 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
37812 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
37813 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
37814 return DAG.getBitcast(RootVT, Res);
37815 }
37816 }
37817
37818 // Don't try to re-form single instruction chains under any circumstances now
37819 // that we've done encoding canonicalization for them.
37820 if (Depth < 1)
37821 return SDValue();
37822
37823 // Depth threshold above which we can efficiently use variable mask shuffles.
37824 int VariableCrossLaneShuffleDepth =
37825 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
37826 int VariablePerLaneShuffleDepth =
37827 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
37828 AllowVariableCrossLaneMask &=
37829 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
37830 AllowVariablePerLaneMask &=
37831 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
37832 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
37833 // higher depth before combining them.
37834 bool AllowBWIVPERMV3 =
37835 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
37836
37837 bool MaskContainsZeros = isAnyZero(Mask);
37838
37839 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
37840 // If we have a single input lane-crossing shuffle then lower to VPERMV.
37841 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
37842 if (Subtarget.hasAVX2() &&
37843 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
37844 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
37845 Res = CanonicalizeShuffleInput(MaskVT, V1);
37846 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
37847 return DAG.getBitcast(RootVT, Res);
37848 }
37849 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
37850 if ((Subtarget.hasAVX512() &&
37851 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37852 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37853 (Subtarget.hasBWI() &&
37854 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37855 (Subtarget.hasVBMI() &&
37856 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
37857 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37858 V2 = DAG.getUNDEF(MaskVT);
37859 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37860 return DAG.getBitcast(RootVT, Res);
37861 }
37862 }
37863
37864 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
37865 // vector as the second source (non-VLX will pad to 512-bit shuffles).
37866 if (UnaryShuffle && AllowVariableCrossLaneMask &&
37867 ((Subtarget.hasAVX512() &&
37868 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37869 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37870 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
37871 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37872 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37873 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37874 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37875 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37876 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
37877 for (unsigned i = 0; i != NumMaskElts; ++i)
37878 if (Mask[i] == SM_SentinelZero)
37879 Mask[i] = NumMaskElts + i;
37880 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37881 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
37882 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37883 return DAG.getBitcast(RootVT, Res);
37884 }
37885
37886 // If that failed and either input is extracted then try to combine as a
37887 // shuffle with the larger type.
37888 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
37889 Inputs, Root, BaseMask, Depth, HasVariableMask,
37890 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
37891 Subtarget))
37892 return WideShuffle;
37893
37894 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
37895 // (non-VLX will pad to 512-bit shuffles).
37896 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
37897 ((Subtarget.hasAVX512() &&
37898 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37899 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37900 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
37901 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
37902 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37903 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37904 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37905 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37906 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37907 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37908 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37909 return DAG.getBitcast(RootVT, Res);
37910 }
37911 return SDValue();
37912 }
37913
37914 // See if we can combine a single input shuffle with zeros to a bit-mask,
37915 // which is much simpler than any shuffle.
37916 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
37917 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
37918 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
37919 APInt Zero = APInt::getZero(MaskEltSizeInBits);
37920 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
37921 APInt UndefElts(NumMaskElts, 0);
37922 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
37923 for (unsigned i = 0; i != NumMaskElts; ++i) {
37924 int M = Mask[i];
37925 if (M == SM_SentinelUndef) {
37926 UndefElts.setBit(i);
37927 continue;
37928 }
37929 if (M == SM_SentinelZero)
37930 continue;
37931 EltBits[i] = AllOnes;
37932 }
37933 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
37934 Res = CanonicalizeShuffleInput(MaskVT, V1);
37935 unsigned AndOpcode =
37936 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
37937 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
37938 return DAG.getBitcast(RootVT, Res);
37939 }
37940
37941 // If we have a single input shuffle with different shuffle patterns in the
37942 // 128-bit lanes, use the variable mask to VPERMILPS.
37943 // TODO Combine other mask types at higher depths.
37944 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37945 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
37946 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
37947 SmallVector<SDValue, 16> VPermIdx;
37948 for (int M : Mask) {
37949 SDValue Idx =
37950 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
37951 VPermIdx.push_back(Idx);
37952 }
37953 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
37954 Res = CanonicalizeShuffleInput(MaskVT, V1);
37955 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
37956 return DAG.getBitcast(RootVT, Res);
37957 }
37958
37959 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
37960 // to VPERMIL2PD/VPERMIL2PS.
37961 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
37962 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
37963 MaskVT == MVT::v8f32)) {
37964 // VPERMIL2 Operation.
37965 // Bits[3] - Match Bit.
37966 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
37967 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
37968 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
37969 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
37970 SmallVector<int, 8> VPerm2Idx;
37971 unsigned M2ZImm = 0;
37972 for (int M : Mask) {
37973 if (M == SM_SentinelUndef) {
37974 VPerm2Idx.push_back(-1);
37975 continue;
37976 }
37977 if (M == SM_SentinelZero) {
37978 M2ZImm = 2;
37979 VPerm2Idx.push_back(8);
37980 continue;
37981 }
37982 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
37983 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
37984 VPerm2Idx.push_back(Index);
37985 }
37986 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37987 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37988 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
37989 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
37990 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
37991 return DAG.getBitcast(RootVT, Res);
37992 }
37993
37994 // If we have 3 or more shuffle instructions or a chain involving a variable
37995 // mask, we can replace them with a single PSHUFB instruction profitably.
37996 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
37997 // instructions, but in practice PSHUFB tends to be *very* fast so we're
37998 // more aggressive.
37999 if (UnaryShuffle && AllowVariablePerLaneMask &&
38000 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38001 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38002 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38003 SmallVector<SDValue, 16> PSHUFBMask;
38004 int NumBytes = RootVT.getSizeInBits() / 8;
38005 int Ratio = NumBytes / NumMaskElts;
38006 for (int i = 0; i < NumBytes; ++i) {
38007 int M = Mask[i / Ratio];
38008 if (M == SM_SentinelUndef) {
38009 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38010 continue;
38011 }
38012 if (M == SM_SentinelZero) {
38013 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38014 continue;
38015 }
38016 M = Ratio * M + i % Ratio;
38017 assert((M / 16) == (i / 16) && "Lane crossing detected");
38018 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38019 }
38020 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38021 Res = CanonicalizeShuffleInput(ByteVT, V1);
38022 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38023 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38024 return DAG.getBitcast(RootVT, Res);
38025 }
38026
38027 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38028 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38029 // slower than PSHUFB on targets that support both.
38030 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38031 Subtarget.hasXOP()) {
38032 // VPPERM Mask Operation
38033 // Bits[4:0] - Byte Index (0 - 31)
38034 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
38035 SmallVector<SDValue, 16> VPPERMMask;
38036 int NumBytes = 16;
38037 int Ratio = NumBytes / NumMaskElts;
38038 for (int i = 0; i < NumBytes; ++i) {
38039 int M = Mask[i / Ratio];
38040 if (M == SM_SentinelUndef) {
38041 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38042 continue;
38043 }
38044 if (M == SM_SentinelZero) {
38045 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38046 continue;
38047 }
38048 M = Ratio * M + i % Ratio;
38049 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38050 }
38051 MVT ByteVT = MVT::v16i8;
38052 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38053 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38054 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38055 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38056 return DAG.getBitcast(RootVT, Res);
38057 }
38058
38059 // If that failed and either input is extracted then try to combine as a
38060 // shuffle with the larger type.
38061 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38062 Inputs, Root, BaseMask, Depth, HasVariableMask,
38063 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38064 return WideShuffle;
38065
38066 // If we have a dual input shuffle then lower to VPERMV3,
38067 // (non-VLX will pad to 512-bit shuffles)
38068 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38069 ((Subtarget.hasAVX512() &&
38070 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38071 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38072 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38073 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38074 MaskVT == MVT::v16i32)) ||
38075 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38076 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38077 MaskVT == MVT::v32i16)) ||
38078 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38079 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38080 MaskVT == MVT::v64i8)))) {
38081 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38082 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38083 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38084 return DAG.getBitcast(RootVT, Res);
38085 }
38086
38087 // Failed to find any combines.
38088 return SDValue();
38089}
38090
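Both the PSHUFB and VPPERM combines in combineX86ShuffleChain scale the element-level mask down to bytes and use 0x80 as the "zero this byte" control value. A standalone model of that byte-mask expansion, using plain integers instead of SDValues; unlike the listing, this sketch also zeroes undef lanes rather than leaving them undef, and the sentinel constants are assumptions that follow the listing's SM_Sentinel values:

    #include <cstdint>
    #include <vector>

    constexpr int SentinelUndef = -1;
    constexpr int SentinelZero = -2;

    // Expand an element-level shuffle mask into a per-byte PSHUFB control
    // mask. NumBytes is the vector width in bytes; 0x80 in a control byte
    // forces the corresponding destination byte to zero.
    std::vector<uint8_t> buildPSHUFBMask(const std::vector<int> &Mask,
                                         int NumBytes) {
      int Ratio = NumBytes / (int)Mask.size(); // Bytes per mask element.
      std::vector<uint8_t> ByteMask(NumBytes);
      for (int i = 0; i < NumBytes; ++i) {
        int M = Mask[i / Ratio];
        if (M == SentinelZero || M == SentinelUndef) {
          // The real combine keeps undef lanes undef; zeroing them keeps the
          // sketch simple.
          ByteMask[i] = 0x80;
          continue;
        }
        // Index of the source byte within the source vector.
        ByteMask[i] = (uint8_t)(Ratio * M + i % Ratio);
      }
      return ByteMask;
    }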
38091// Combine an arbitrary chain of shuffles + extract_subvectors into a single
38092// instruction if possible.
38093//
38094// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38095// type size to attempt to combine:
38096// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38097// -->
38098// extract_subvector(shuffle(x,y,m2),0)
38099static SDValue combineX86ShuffleChainWithExtract(
38100 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38101 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38102 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38103 const X86Subtarget &Subtarget) {
38104 unsigned NumMaskElts = BaseMask.size();
38105 unsigned NumInputs = Inputs.size();
38106 if (NumInputs == 0)
38107 return SDValue();
38108
38109 EVT RootVT = Root.getValueType();
38110 unsigned RootSizeInBits = RootVT.getSizeInBits();
38111 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38112
38113 // Bail if we have any smaller inputs.
38114 if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
38115 return Input.getValueSizeInBits() < RootSizeInBits;
38116 }))
38117 return SDValue();
38118
38119 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38120 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
38121
38122 // Peek through subvectors.
38123 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
38124 unsigned WideSizeInBits = RootSizeInBits;
38125 for (unsigned i = 0; i != NumInputs; ++i) {
38126 SDValue &Src = WideInputs[i];
38127 unsigned &Offset = Offsets[i];
38128 Src = peekThroughBitcasts(Src);
38129 EVT BaseVT = Src.getValueType();
38130 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
38131 Offset += Src.getConstantOperandVal(1);
38132 Src = Src.getOperand(0);
38133 }
38134 WideSizeInBits = std::max(WideSizeInBits,
38135 (unsigned)Src.getValueSizeInBits());
38136 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
38137 "Unexpected subvector extraction");
38138 Offset /= BaseVT.getVectorNumElements();
38139 Offset *= NumMaskElts;
38140 }
38141
38142 // Bail if we're always extracting from the lowest subvectors;
38143 // combineX86ShuffleChain should match this for the current width.
38144 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
38145 return SDValue();
38146
38147 unsigned Scale = WideSizeInBits / RootSizeInBits;
38148 assert((WideSizeInBits % RootSizeInBits) == 0 &&
38149 "Unexpected subvector extraction");
38150
38151 // If the src vector types aren't the same, see if we can extend
38152 // them to match each other.
38153 // TODO: Support different scalar types?
38154 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
38155 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
38156 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
38157 Op.getValueType().getScalarType() != WideSVT;
38158 }))
38159 return SDValue();
38160
38161 // Create new mask for larger type.
38162 for (unsigned i = 1; i != NumInputs; ++i)
38163 Offsets[i] += i * Scale * NumMaskElts;
38164
38165 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
38166 for (int &M : WideMask) {
38167 if (M < 0)
38168 continue;
38169 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
38170 }
38171 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38172
38173 // Remove unused/repeated shuffle source ops.
38174 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38175 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38176
38177 if (WideInputs.size() > 2)
38178 return SDValue();
38179
38180 // Increase depth for every upper subvector we've peeked through.
38181 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
38182
38183 // Attempt to combine wider chain.
38184 // TODO: Can we use a better Root?
38185 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38186 WideInputs.back().getValueSizeInBits()
38187 ? WideInputs.front()
38188 : WideInputs.back();
38189 if (SDValue WideShuffle =
38190 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
38191 HasVariableMask, AllowVariableCrossLaneMask,
38192 AllowVariablePerLaneMask, DAG, Subtarget)) {
38193 WideShuffle =
38194 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
38195 return DAG.getBitcast(RootVT, WideShuffle);
38196 }
38197 return SDValue();
38198}
38199
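combineX86ShuffleChainWithExtract rewrites shuffle(extract_subvector(x,c1), extract_subvector(y,c2), m1) as extract_subvector(shuffle(x,y,m2), 0) by biasing each mask element with its input's subvector offset and padding the mask out to the wide width. A minimal index-only sketch of that remapping; Offsets[j] is assumed to already hold input j's starting position, in mask-element units, within the concatenated wide vector (as the listing computes before building WideMask):

    #include <vector>

    constexpr int SentinelUndef = -1;

    // Widen a NumMaskElts-element shuffle mask so it indexes into the wider,
    // concatenated input space of Scale * NumMaskElts elements.
    std::vector<int> widenShuffleMask(const std::vector<int> &BaseMask,
                                      const std::vector<int> &Offsets,
                                      int NumMaskElts, int Scale) {
      std::vector<int> WideMask(BaseMask);
      for (int &M : WideMask) {
        if (M < 0)
          continue; // Keep undef/zero sentinels unchanged.
        M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
      }
      // Pad with undef so the mask covers the full widened vector.
      WideMask.insert(WideMask.end(), (Scale - 1) * NumMaskElts, SentinelUndef);
      return WideMask;
    }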
38200// Canonicalize the combined shuffle mask chain with horizontal ops.
38201// NOTE: This may update the Ops and Mask.
38202static SDValue canonicalizeShuffleMaskWithHorizOp(
38203 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
38204 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
38205 const X86Subtarget &Subtarget) {
38206 if (Mask.empty() || Ops.empty())
38207 return SDValue();
38208
38209 SmallVector<SDValue> BC;
38210 for (SDValue Op : Ops)
38211 BC.push_back(peekThroughBitcasts(Op));
38212
38213 // All ops must be the same horizop + type.
38214 SDValue BC0 = BC[0];
38215 EVT VT0 = BC0.getValueType();
38216 unsigned Opcode0 = BC0.getOpcode();
38217 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
38218 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
38219 }))
38220 return SDValue();
38221
38222 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
38223 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
38224 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
38225 if (!isHoriz && !isPack)
38226 return SDValue();
38227
38228 // Do all ops have a single use?
38229 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
38230 return Op.hasOneUse() &&
38231 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
38232 });
38233
38234 int NumElts = VT0.getVectorNumElements();
38235 int NumLanes = VT0.getSizeInBits() / 128;
38236 int NumEltsPerLane = NumElts / NumLanes;
38237 int NumHalfEltsPerLane = NumEltsPerLane / 2;
38238 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
38239 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38240
38241 if (NumEltsPerLane >= 4 &&
38242 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
38243 SmallVector<int> LaneMask, ScaledMask;
38244 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
38245 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
38246 // See if we can remove the shuffle by resorting the HOP chain so that
38247 // the HOP args are pre-shuffled.
38248 // TODO: Generalize to any sized/depth chain.
38249 // TODO: Add support for PACKSS/PACKUS.
38250 if (isHoriz) {
38251 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
38252 auto GetHOpSrc = [&](int M) {
38253 if (M == SM_SentinelUndef)
38254 return DAG.getUNDEF(VT0);
38255 if (M == SM_SentinelZero)
38256 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
38257 SDValue Src0 = BC[M / 4];
38258 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
38259 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
38260 return Src1.getOperand(M % 2);
38261 return SDValue();
38262 };
38263 SDValue M0 = GetHOpSrc(ScaledMask[0]);
38264 SDValue M1 = GetHOpSrc(ScaledMask[1]);
38265 SDValue M2 = GetHOpSrc(ScaledMask[2]);
38266 SDValue M3 = GetHOpSrc(ScaledMask[3]);
38267 if (M0 && M1 && M2 && M3) {
38268 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
38269 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
38270 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38271 }
38272 }
38273 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
38274 if (Ops.size() >= 2) {
38275 SDValue LHS, RHS;
38276 auto GetHOpSrc = [&](int M, int &OutM) {
38277 // TODO: Support SM_SentinelZero
38278 if (M < 0)
38279 return M == SM_SentinelUndef;
38280 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
38281 if (!LHS || LHS == Src) {
38282 LHS = Src;
38283 OutM = (M % 2);
38284 return true;
38285 }
38286 if (!RHS || RHS == Src) {
38287 RHS = Src;
38288 OutM = (M % 2) + 2;
38289 return true;
38290 }
38291 return false;
38292 };
38293 int PostMask[4] = {-1, -1, -1, -1};
38294 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
38295 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
38296 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
38297 GetHOpSrc(ScaledMask[3], PostMask[3])) {
38298 LHS = DAG.getBitcast(SrcVT, LHS);
38299 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
38300 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38301 // Use SHUFPS for the permute so this will work on SSE3 targets,
38302 // shuffle combining and domain handling will simplify this later on.
38303 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
38304 Res = DAG.getBitcast(ShuffleVT, Res);
38305 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
38306 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
38307 }
38308 }
38309 }
38310 }
38311
38312 if (2 < Ops.size())
38313 return SDValue();
38314
38315 SDValue BC1 = BC[BC.size() - 1];
38316 if (Mask.size() == VT0.getVectorNumElements()) {
38317 // Canonicalize binary shuffles of horizontal ops that use the
38318 // same sources to a unary shuffle.
38319 // TODO: Try to perform this fold even if the shuffle remains.
38320 if (Ops.size() == 2) {
38321 auto ContainsOps = [](SDValue HOp, SDValue Op) {
38322 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
38323 };
38324 // Commute if all BC0's ops are contained in BC1.
38325 if (ContainsOps(BC1, BC0.getOperand(0)) &&
38326 ContainsOps(BC1, BC0.getOperand(1))) {
38327 ShuffleVectorSDNode::commuteMask(Mask);
38328 std::swap(Ops[0], Ops[1]);
38329 std::swap(BC0, BC1);
38330 }
38331
38332 // If BC1 can be represented by BC0, then convert to unary shuffle.
38333 if (ContainsOps(BC0, BC1.getOperand(0)) &&
38334 ContainsOps(BC0, BC1.getOperand(1))) {
38335 for (int &M : Mask) {
38336 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
38337 continue;
38338 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
38339 M -= NumElts + (SubLane * NumHalfEltsPerLane);
38340 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
38341 M += NumHalfEltsPerLane;
38342 }
38343 }
38344 }
38345
38346 // Canonicalize unary horizontal ops to only refer to lower halves.
38347 for (int i = 0; i != NumElts; ++i) {
38348 int &M = Mask[i];
38349 if (isUndefOrZero(M))
38350 continue;
38351 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
38352 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38353 M -= NumHalfEltsPerLane;
38354 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
38355 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38356 M -= NumHalfEltsPerLane;
38357 }
38358 }
38359
38360 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
38361 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
38362 // represents the LHS/RHS inputs for the lower/upper halves.
38363 SmallVector<int, 16> TargetMask128, WideMask128;
38364 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
38365 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
38366 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
38367 bool SingleOp = (Ops.size() == 1);
38368 if (isPack || OneUseOps ||
38369 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
38370 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
38371 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
38372 Lo = Lo.getOperand(WideMask128[0] & 1);
38373 Hi = Hi.getOperand(WideMask128[1] & 1);
38374 if (SingleOp) {
38375 SDValue Undef = DAG.getUNDEF(SrcVT);
38376 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
38377 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
38378 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
38379 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
38380 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
38381 }
38382 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
38383 }
38384 }
38385
38386 return SDValue();
38387}
38388
38389// Attempt to constant fold all of the constant source ops.
38390// Returns the constant-folded node if the entire shuffle folds to a constant.
38391// TODO: Extend this to merge multiple constant Ops and update the mask.
38392static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
38393 ArrayRef<int> Mask, SDValue Root,
38394 bool HasVariableMask,
38395 SelectionDAG &DAG,
38396 const X86Subtarget &Subtarget) {
38397 MVT VT = Root.getSimpleValueType();
38398
38399 unsigned SizeInBits = VT.getSizeInBits();
38400 unsigned NumMaskElts = Mask.size();
38401 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
38402 unsigned NumOps = Ops.size();
38403
38404 // Extract constant bits from each source op.
38405 bool OneUseConstantOp = false;
38406 SmallVector<APInt, 16> UndefEltsOps(NumOps);
38407 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
38408 for (unsigned i = 0; i != NumOps; ++i) {
38409 SDValue SrcOp = Ops[i];
38410 OneUseConstantOp |= SrcOp.hasOneUse();
38411 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
38412 RawBitsOps[i]))
38413 return SDValue();
38414 }
38415
38416 // If we're optimizing for size, only fold if at least one of the constants is
38417 // only used once or the combined shuffle has included a variable mask
38418 // shuffle; this is to avoid constant pool bloat.
38419 bool IsOptimizingSize = DAG.shouldOptForSize();
38420 if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
38421 return SDValue();
38422
38423 // Shuffle the constant bits according to the mask.
38424 SDLoc DL(Root);
38425 APInt UndefElts(NumMaskElts, 0);
38426 APInt ZeroElts(NumMaskElts, 0);
38427 APInt ConstantElts(NumMaskElts, 0);
38428 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
38429 APInt::getZero(MaskSizeInBits));
38430 for (unsigned i = 0; i != NumMaskElts; ++i) {
38431 int M = Mask[i];
38432 if (M == SM_SentinelUndef) {
38433 UndefElts.setBit(i);
38434 continue;
38435 } else if (M == SM_SentinelZero) {
38436 ZeroElts.setBit(i);
38437 continue;
38438 }
38439 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
38440
38441 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
38442 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
38443
38444 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
38445 if (SrcUndefElts[SrcMaskIdx]) {
38446 UndefElts.setBit(i);
38447 continue;
38448 }
38449
38450 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
38451 APInt &Bits = SrcEltBits[SrcMaskIdx];
38452 if (!Bits) {
38453 ZeroElts.setBit(i);
38454 continue;
38455 }
38456
38457 ConstantElts.setBit(i);
38458 ConstantBitData[i] = Bits;
38459 }
38460 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
38461
38462 // Attempt to create a zero vector.
38463 if ((UndefElts | ZeroElts).isAllOnes())
38464 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
38465
38466 // Create the constant data.
38467 MVT MaskSVT;
38468 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
38469 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
38470 else
38471 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
38472
38473 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
38474 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
38475 return SDValue();
38476
38477 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
38478 return DAG.getBitcast(VT, CstOp);
38479}
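// Illustrative sketch (simplified standalone model, not part of this file):
// the constant-folding step above walks the mask and, for each non-sentinel
// lane, copies the referenced element's bits from the matching source op. A
// minimal scalar model of that indexing, using uint64_t element values and the
// same negative-sentinel convention (-1 undef, -2 zero):
#include <cstdint>
#include <vector>

static std::vector<uint64_t>
foldShuffleOfConstants(const std::vector<std::vector<uint64_t>> &Ops,
                       const std::vector<int> &Mask) {
  const size_t NumElts = Mask.size();
  std::vector<uint64_t> Folded(NumElts, 0);
  for (size_t i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef (-1) and zero (-2) lanes both fold to 0 here
    size_t OpIdx = size_t(M) / NumElts;  // which constant source op
    size_t EltIdx = size_t(M) % NumElts; // which element within that op
    Folded[i] = Ops[OpIdx][EltIdx];
  }
  return Folded;
}
// e.g. foldShuffleOfConstants({{1,2,3,4},{5,6,7,8}}, {0,4,-1,7}) == {1,5,0,8}.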
38480
38481namespace llvm {
38482 namespace X86 {
38483 enum {
38484 MaxShuffleCombineDepth = 8
38485 };
38486 }
38487} // namespace llvm
38488
38489/// Fully generic combining of x86 shuffle instructions.
38490///
38491/// This should be the last combine run over the x86 shuffle instructions. Once
38492/// they have been fully optimized, this will recursively consider all chains
38493/// of single-use shuffle instructions, build a generic model of the cumulative
38494/// shuffle operation, and check for simpler instructions which implement this
38495/// operation. We use this primarily for two purposes:
38496///
38497/// 1) Collapse generic shuffles to specialized single instructions when
38498/// equivalent. In most cases, this is just an encoding size win, but
38499/// sometimes we will collapse multiple generic shuffles into a single
38500/// special-purpose shuffle.
38501/// 2) Look for sequences of shuffle instructions with 3 or more total
38502/// instructions, and replace them with the slightly more expensive SSSE3
38503/// PSHUFB instruction if available. We do this as the last combining step
38504/// to ensure we avoid using PSHUFB if we can implement the shuffle with
38505/// a suitable short sequence of other instructions. The PSHUFB will either
38506/// use a register or have to read from memory and so is slightly (but only
38507/// slightly) more expensive than the other shuffle instructions.
38508///
38509/// Because this is inherently a quadratic operation (for each shuffle in
38510/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
38511/// This should never be an issue in practice as the shuffle lowering doesn't
38512/// produce sequences of more than 8 instructions.
38513///
38514/// FIXME: We will currently miss some cases where the redundant shuffling
38515/// would simplify under the threshold for PSHUFB formation because of
38516/// combine-ordering. To fix this, we should do the redundant instruction
38517/// combining in this recursive walk.
38518static SDValue combineX86ShufflesRecursively(
38519 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
38520 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
38521 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
38522 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38523 const X86Subtarget &Subtarget) {
38524   assert(RootMask.size() > 0 &&
38525          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
38526          "Illegal shuffle root mask");
38527 MVT RootVT = Root.getSimpleValueType();
38528 assert(RootVT.isVector() && "Shuffles operate on vector types!");
38529 unsigned RootSizeInBits = RootVT.getSizeInBits();
38530
38531 // Bound the depth of our recursive combine because this is ultimately
38532 // quadratic in nature.
38533 if (Depth >= MaxDepth)
38534 return SDValue();
38535
38536 // Directly rip through bitcasts to find the underlying operand.
38537 SDValue Op = SrcOps[SrcOpIndex];
38538 Op = peekThroughOneUseBitcasts(Op);
38539
38540 EVT VT = Op.getValueType();
38541 if (!VT.isVector() || !VT.isSimple())
38542 return SDValue(); // Bail if we hit a non-simple non-vector.
38543
38544 // FIXME: Just bail on f16 for now.
38545 if (VT.getVectorElementType() == MVT::f16)
38546 return SDValue();
38547
38548   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
38549          "Can only combine shuffles upto size of the root op.");
38550
38551 // Extract target shuffle mask and resolve sentinels and inputs.
38552 // TODO - determine Op's demanded elts from RootMask.
38553 SmallVector<int, 64> OpMask;
38554 SmallVector<SDValue, 2> OpInputs;
38555 APInt OpUndef, OpZero;
38556 APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
38557 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
38558 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
38559 OpZero, DAG, Depth, false)) {
38560 // Shuffle inputs must not be larger than the shuffle result.
38561 // TODO: Relax this for single input faux shuffles (e.g. trunc).
38562 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
38563 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
38564 }))
38565 return SDValue();
38566 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38567 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
38568 !isNullConstant(Op.getOperand(1))) {
38569 SDValue SrcVec = Op.getOperand(0);
38570 int ExtractIdx = Op.getConstantOperandVal(1);
38571 unsigned NumElts = VT.getVectorNumElements();
38572 OpInputs.assign({SrcVec});
38573 OpMask.assign(NumElts, SM_SentinelUndef);
38574 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
38575 OpZero = OpUndef = APInt::getNullValue(NumElts);
38576 } else {
38577 return SDValue();
38578 }
38579
38580 // If the shuffle result was smaller than the root, we need to adjust the
38581 // mask indices and pad the mask with undefs.
38582 if (RootSizeInBits > VT.getSizeInBits()) {
38583 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
38584 unsigned OpMaskSize = OpMask.size();
38585 if (OpInputs.size() > 1) {
38586 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
38587 for (int &M : OpMask) {
38588 if (M < 0)
38589 continue;
38590 int EltIdx = M % OpMaskSize;
38591 int OpIdx = M / OpMaskSize;
38592 M = (PaddedMaskSize * OpIdx) + EltIdx;
38593 }
38594 }
38595 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
38596 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
38597 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
38598 }
38599
38600 SmallVector<int, 64> Mask;
38601 SmallVector<SDValue, 16> Ops;
38602
38603 // We don't need to merge masks if the root is empty.
38604 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
38605 if (EmptyRoot) {
38606 // Only resolve zeros if it will remove an input, otherwise we might end
38607 // up in an infinite loop.
38608 bool ResolveKnownZeros = true;
38609 if (!OpZero.isZero()) {
38610 APInt UsedInputs = APInt::getZero(OpInputs.size());
38611 for (int i = 0, e = OpMask.size(); i != e; ++i) {
38612 int M = OpMask[i];
38613 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
38614 continue;
38615 UsedInputs.setBit(M / OpMask.size());
38616 if (UsedInputs.isAllOnes()) {
38617 ResolveKnownZeros = false;
38618 break;
38619 }
38620 }
38621 }
38622 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
38623 ResolveKnownZeros);
38624
38625 Mask = OpMask;
38626 Ops.append(OpInputs.begin(), OpInputs.end());
38627 } else {
38628 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
38629
38630 // Add the inputs to the Ops list, avoiding duplicates.
38631 Ops.append(SrcOps.begin(), SrcOps.end());
38632
38633 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
38634 // Attempt to find an existing match.
38635 SDValue InputBC = peekThroughBitcasts(Input);
38636 for (int i = 0, e = Ops.size(); i < e; ++i)
38637 if (InputBC == peekThroughBitcasts(Ops[i]))
38638 return i;
38639 // Match failed - should we replace an existing Op?
38640 if (InsertionPoint >= 0) {
38641 Ops[InsertionPoint] = Input;
38642 return InsertionPoint;
38643 }
38644 // Add to the end of the Ops list.
38645 Ops.push_back(Input);
38646 return Ops.size() - 1;
38647 };
38648
38649 SmallVector<int, 2> OpInputIdx;
38650 for (SDValue OpInput : OpInputs)
38651 OpInputIdx.push_back(
38652 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
38653
38654     assert(((RootMask.size() > OpMask.size() &&
38655              RootMask.size() % OpMask.size() == 0) ||
38656             (OpMask.size() > RootMask.size() &&
38657              OpMask.size() % RootMask.size() == 0) ||
38658             OpMask.size() == RootMask.size()) &&
38659            "The smaller number of elements must divide the larger.");
38660
38661 // This function can be performance-critical, so we rely on the power-of-2
38662 // knowledge that we have about the mask sizes to replace div/rem ops with
38663 // bit-masks and shifts.
38664     assert(isPowerOf2_32(RootMask.size()) &&
38665            "Non-power-of-2 shuffle mask sizes");
38666     assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
38667 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
38668 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
38669
38670 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
38671 unsigned RootRatio =
38672 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
38673 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
38674     assert((RootRatio == 1 || OpRatio == 1) &&
38675            "Must not have a ratio for both incoming and op masks!");
38676
38677     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
38678     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
38679     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
38680 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
38681 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
38682
38683 Mask.resize(MaskWidth, SM_SentinelUndef);
38684
38685 // Merge this shuffle operation's mask into our accumulated mask. Note that
38686 // this shuffle's mask will be the first applied to the input, followed by
38687 // the root mask to get us all the way to the root value arrangement. The
38688 // reason for this order is that we are recursing up the operation chain.
38689 for (unsigned i = 0; i < MaskWidth; ++i) {
38690 unsigned RootIdx = i >> RootRatioLog2;
38691 if (RootMask[RootIdx] < 0) {
38692 // This is a zero or undef lane, we're done.
38693 Mask[i] = RootMask[RootIdx];
38694 continue;
38695 }
38696
38697 unsigned RootMaskedIdx =
38698 RootRatio == 1
38699 ? RootMask[RootIdx]
38700 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
38701
38702 // Just insert the scaled root mask value if it references an input other
38703 // than the SrcOp we're currently inserting.
38704 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
38705 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
38706 Mask[i] = RootMaskedIdx;
38707 continue;
38708 }
38709
38710 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
38711 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
38712 if (OpMask[OpIdx] < 0) {
38713 // The incoming lanes are zero or undef; it doesn't matter which ones we
38714 // are using.
38715 Mask[i] = OpMask[OpIdx];
38716 continue;
38717 }
38718
38719 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
38720 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
38721 : (OpMask[OpIdx] << OpRatioLog2) +
38722 (RootMaskedIdx & (OpRatio - 1));
38723
38724 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
38725 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
38726     assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
38727 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
38728
38729 Mask[i] = OpMaskedIdx;
38730 }
38731 }
38732
38733 // Remove unused/repeated shuffle source ops.
38734 resolveTargetShuffleInputsAndMask(Ops, Mask);
38735
38736 // Handle the all undef/zero/ones cases early.
38737 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
38738 return DAG.getUNDEF(RootVT);
38739 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
38740 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
38741 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
38742 none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
38743 return getOnesVector(RootVT, DAG, SDLoc(Root));
38744
38745 assert(!Ops.empty() && "Shuffle with no inputs detected");
38746 HasVariableMask |= IsOpVariableMask;
38747
38748 // Update the list of shuffle nodes that have been combined so far.
38749 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
38750 SrcNodes.end());
38751 CombinedNodes.push_back(Op.getNode());
38752
38753 // See if we can recurse into each shuffle source op (if it's a target
38754 // shuffle). The source op should only be generally combined if it either has
38755 // a single use (i.e. the current Op) or all its users have already been combined;
38756 // if not, we can still combine but should prevent generation of variable
38757 // shuffles to avoid constant pool bloat.
38758 // Don't recurse if we already have more source ops than we can combine in
38759 // the remaining recursion depth.
38760 if (Ops.size() < (MaxDepth - Depth)) {
38761 for (int i = 0, e = Ops.size(); i < e; ++i) {
38762 // For empty roots, we need to resolve zeroable elements before combining
38763 // them with other shuffles.
38764 SmallVector<int, 64> ResolvedMask = Mask;
38765 if (EmptyRoot)
38766 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
38767 bool AllowCrossLaneVar = false;
38768 bool AllowPerLaneVar = false;
38769 if (Ops[i].getNode()->hasOneUse() ||
38770 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
38771 AllowCrossLaneVar = AllowVariableCrossLaneMask;
38772 AllowPerLaneVar = AllowVariablePerLaneMask;
38773 }
38774 if (SDValue Res = combineX86ShufflesRecursively(
38775 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
38776 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
38777 Subtarget))
38778 return Res;
38779 }
38780 }
38781
38782 // Attempt to constant fold all of the constant source ops.
38783 if (SDValue Cst = combineX86ShufflesConstants(
38784 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
38785 return Cst;
38786
38787 // If constant fold failed and we only have constants - then we have
38788 // multiple uses by a single non-variable shuffle - just bail.
38789 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
38790 APInt UndefElts;
38791 SmallVector<APInt> RawBits;
38792 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38793 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
38794 RawBits);
38795 })) {
38796 return SDValue();
38797 }
38798
38799 // Canonicalize the combined shuffle mask chain with horizontal ops.
38800 // NOTE: This will update the Ops and Mask.
38801 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
38802 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
38803 return DAG.getBitcast(RootVT, HOp);
38804
38805 // Try to refine our inputs given our knowledge of target shuffle mask.
38806 for (auto I : enumerate(Ops)) {
38807 int OpIdx = I.index();
38808 SDValue &Op = I.value();
38809
38810 // What range of shuffle mask element values results in picking from Op?
38811 int Lo = OpIdx * Mask.size();
38812 int Hi = Lo + Mask.size();
38813
38814 // Which elements of Op do we demand, given the mask's granularity?
38815 APInt OpDemandedElts(Mask.size(), 0);
38816 for (int MaskElt : Mask) {
38817 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
38818 int OpEltIdx = MaskElt - Lo;
38819 OpDemandedElts.setBit(OpEltIdx);
38820 }
38821 }
38822
38823 // Is the shuffle result smaller than the root?
38824 if (Op.getValueSizeInBits() < RootSizeInBits) {
38825 // We padded the mask with undefs. But we now need to undo that.
38826 unsigned NumExpectedVectorElts = Mask.size();
38827 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
38828 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
38829       assert(!OpDemandedElts.extractBits(
38830                  NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
38831              "Demanding the virtual undef widening padding?");
38832 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
38833 }
38834
38835 // The Op itself may be of different VT, so we need to scale the mask.
38836 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
38837 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
38838
38839 // Can this operand be simplified any further, given its demanded elements?
38840 if (SDValue NewOp =
38841 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
38842 Op, OpScaledDemandedElts, DAG))
38843 Op = NewOp;
38844 }
38845 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
38846
38847 // Widen any subvector shuffle inputs we've collected.
38848 // TODO: Remove this to avoid generating temporary nodes, we should only
38849 // widen once combineX86ShuffleChain has found a match.
38850 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
38851 return Op.getValueSizeInBits() < RootSizeInBits;
38852 })) {
38853 for (SDValue &Op : Ops)
38854 if (Op.getValueSizeInBits() < RootSizeInBits)
38855 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
38856 RootSizeInBits);
38857 // Reresolve - we might have repeated subvector sources.
38858 resolveTargetShuffleInputsAndMask(Ops, Mask);
38859 }
38860
38861 // We can only combine unary and binary shuffle mask cases.
38862 if (Ops.size() <= 2) {
38863 // Minor canonicalization of the accumulated shuffle mask to make it easier
38864 // to match below. All this does is detect masks with sequential pairs of
38865 // elements, and shrink them to the half-width mask. It does this in a loop
38866 // so it will reduce the size of the mask to the minimal width mask which
38867 // performs an equivalent shuffle.
38868 while (Mask.size() > 1) {
38869 SmallVector<int, 64> WidenedMask;
38870 if (!canWidenShuffleElements(Mask, WidenedMask))
38871 break;
38872 Mask = std::move(WidenedMask);
38873 }
38874
38875 // Canonicalization of binary shuffle masks to improve pattern matching by
38876 // commuting the inputs.
38877 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
38878 ShuffleVectorSDNode::commuteMask(Mask);
38879 std::swap(Ops[0], Ops[1]);
38880 }
38881
38882 // Finally, try to combine into a single shuffle instruction.
38883 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
38884 AllowVariableCrossLaneMask,
38885 AllowVariablePerLaneMask, DAG, Subtarget);
38886 }
38887
38888 // If that failed and any input is extracted then try to combine as a
38889 // shuffle with the larger type.
38890 return combineX86ShuffleChainWithExtract(
38891 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
38892 AllowVariablePerLaneMask, DAG, Subtarget);
38893}
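// Illustrative sketch (simplified standalone model, not part of this file):
// the heart of the recursion above is merging the accumulated root mask with
// the mask of the op being peeled, after scaling both to a common width. For
// the single-input case with power-of-two mask sizes, the index arithmetic
// reduces to:
#include <algorithm>
#include <vector>

static std::vector<int> mergeShuffleMasks(const std::vector<int> &RootMask,
                                          const std::vector<int> &OpMask) {
  const size_t Width = std::max(RootMask.size(), OpMask.size());
  const size_t RootRatio = Width / RootMask.size(); // result lanes per root elt
  const size_t OpRatio = Width / OpMask.size();     // result lanes per op elt
  std::vector<int> Merged(Width, -1);
  for (size_t i = 0; i != Width; ++i) {
    int R = RootMask[i / RootRatio];
    if (R < 0) { Merged[i] = R; continue; } // undef/zero sentinel wins
    size_t Scaled = size_t(R) * RootRatio + (i % RootRatio);
    int O = OpMask[Scaled / OpRatio];
    if (O < 0) { Merged[i] = O; continue; } // op lane is undef/zero
    Merged[i] = int(size_t(O) * OpRatio + (Scaled % OpRatio));
  }
  return Merged;
}
// e.g. a 2-element root mask {1,0} over a 4-element op mask {2,3,0,1} merges
// to {0,1,2,3}: applying the op's shuffle and then the root shuffle is the
// same as one shuffle by the merged mask.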
38894
38895/// Helper entry wrapper to combineX86ShufflesRecursively.
38896static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
38897 const X86Subtarget &Subtarget) {
38898 return combineX86ShufflesRecursively(
38899 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
38900 /*HasVarMask*/ false,
38901 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
38902 Subtarget);
38903}
38904
38905/// Get the PSHUF-style mask from PSHUF node.
38906///
38907/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
38908/// PSHUF-style masks that can be reused with such instructions.
38909static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
38910 MVT VT = N.getSimpleValueType();
38911 SmallVector<int, 4> Mask;
38912 SmallVector<SDValue, 2> Ops;
38913 bool HaveMask =
38914 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
38915 (void)HaveMask;
38916 assert(HaveMask);
38917
38918 // If we have more than 128-bits, only the low 128-bits of shuffle mask
38919 // matter. Check that the upper masks are repeats and remove them.
38920 if (VT.getSizeInBits() > 128) {
38921 int LaneElts = 128 / VT.getScalarSizeInBits();
38922#ifndef NDEBUG
38923 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
38924 for (int j = 0; j < LaneElts; ++j)
38925         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
38926                "Mask doesn't repeat in high 128-bit lanes!");
38927#endif
38928 Mask.resize(LaneElts);
38929 }
38930
38931 switch (N.getOpcode()) {
38932 case X86ISD::PSHUFD:
38933 return Mask;
38934 case X86ISD::PSHUFLW:
38935 Mask.resize(4);
38936 return Mask;
38937 case X86ISD::PSHUFHW:
38938 Mask.erase(Mask.begin(), Mask.begin() + 4);
38939 for (int &M : Mask)
38940 M -= 4;
38941 return Mask;
38942 default:
38943     llvm_unreachable("No valid shuffle instruction found!");
38944 }
38945}
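// Illustrative sketch (not part of this file): PSHUFD/PSHUFLW/PSHUFHW style
// masks like the ones returned above are ultimately re-encoded into an 8-bit
// immediate with two bits per output lane (see the calls to
// getV4X86ShuffleImm8ForMask elsewhere in this listing). A minimal encoder,
// treating undef lanes (-1) as 0:
#include <array>
#include <cstdint>

static uint8_t encodeV4ShuffleImm(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // undef lane: any selector works
    Imm |= uint8_t((M & 0x3) << (2 * i));
  }
  return Imm;
}
// e.g. the identity mask {0,1,2,3} encodes to 0xE4 and the reversal {3,2,1,0}
// encodes to 0x1B.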
38946
38947/// Search for a combinable shuffle across a chain ending in pshufd.
38948///
38949/// We walk up the chain and look for a combinable shuffle, skipping over
38950/// shuffles that we could hoist this shuffle's transformation past without
38951/// altering anything.
38952static SDValue
38953combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
38954 SelectionDAG &DAG) {
38955   assert(N.getOpcode() == X86ISD::PSHUFD &&
38956          "Called with something other than an x86 128-bit half shuffle!");
38957 SDLoc DL(N);
38958
38959 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
38960 // of the shuffles in the chain so that we can form a fresh chain to replace
38961 // this one.
38962 SmallVector<SDValue, 8> Chain;
38963 SDValue V = N.getOperand(0);
38964 for (; V.hasOneUse(); V = V.getOperand(0)) {
38965 switch (V.getOpcode()) {
38966 default:
38967 return SDValue(); // Nothing combined!
38968
38969 case ISD::BITCAST:
38970 // Skip bitcasts as we always know the type for the target specific
38971 // instructions.
38972 continue;
38973
38974 case X86ISD::PSHUFD:
38975 // Found another dword shuffle.
38976 break;
38977
38978 case X86ISD::PSHUFLW:
38979 // Check that the low words (being shuffled) are the identity in the
38980 // dword shuffle, and the high words are self-contained.
38981 if (Mask[0] != 0 || Mask[1] != 1 ||
38982 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
38983 return SDValue();
38984
38985 Chain.push_back(V);
38986 continue;
38987
38988 case X86ISD::PSHUFHW:
38989 // Check that the high words (being shuffled) are the identity in the
38990 // dword shuffle, and the low words are self-contained.
38991 if (Mask[2] != 2 || Mask[3] != 3 ||
38992 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
38993 return SDValue();
38994
38995 Chain.push_back(V);
38996 continue;
38997
38998 case X86ISD::UNPCKL:
38999 case X86ISD::UNPCKH:
39000 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39001 // shuffle into a preceding word shuffle.
39002 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39003 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39004 return SDValue();
39005
39006 // Search for a half-shuffle which we can combine with.
39007 unsigned CombineOp =
39008 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39009 if (V.getOperand(0) != V.getOperand(1) ||
39010 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39011 return SDValue();
39012 Chain.push_back(V);
39013 V = V.getOperand(0);
39014 do {
39015 switch (V.getOpcode()) {
39016 default:
39017 return SDValue(); // Nothing to combine.
39018
39019 case X86ISD::PSHUFLW:
39020 case X86ISD::PSHUFHW:
39021 if (V.getOpcode() == CombineOp)
39022 break;
39023
39024 Chain.push_back(V);
39025
39026         LLVM_FALLTHROUGH;
39027 case ISD::BITCAST:
39028 V = V.getOperand(0);
39029 continue;
39030 }
39031 break;
39032 } while (V.hasOneUse());
39033 break;
39034 }
39035 // Break out of the loop if we break out of the switch.
39036 break;
39037 }
39038
39039 if (!V.hasOneUse())
39040 // We fell out of the loop without finding a viable combining instruction.
39041 return SDValue();
39042
39043 // Merge this node's mask and our incoming mask.
39044 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39045 for (int &M : Mask)
39046 M = VMask[M];
39047 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39048 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39049
39050 // Rebuild the chain around this new shuffle.
39051 while (!Chain.empty()) {
39052 SDValue W = Chain.pop_back_val();
39053
39054 if (V.getValueType() != W.getOperand(0).getValueType())
39055 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39056
39057 switch (W.getOpcode()) {
39058 default:
39059       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39060
39061 case X86ISD::UNPCKL:
39062 case X86ISD::UNPCKH:
39063 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39064 break;
39065
39066 case X86ISD::PSHUFD:
39067 case X86ISD::PSHUFLW:
39068 case X86ISD::PSHUFHW:
39069 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39070 break;
39071 }
39072 }
39073 if (V.getValueType() != N.getValueType())
39074 V = DAG.getBitcast(N.getValueType(), V);
39075
39076 // Return the new chain to replace N.
39077 return V;
39078}
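// Illustrative sketch (not part of this file): the "M = VMask[M]" update above
// is plain shuffle-mask composition: shuffling by VMask first and by Mask
// second is the same as a single shuffle by the composed mask.
#include <array>

static std::array<int, 4> composeMasks(const std::array<int, 4> &Outer,
                                       const std::array<int, 4> &Inner) {
  std::array<int, 4> Composed{};
  for (unsigned i = 0; i != 4; ++i)
    Composed[i] = Inner[Outer[i]]; // lane i reads Inner's pick for Outer[i]
  return Composed;
}
// e.g. composeMasks({2,3,0,1}, {1,0,3,2}) == {3,2,1,0}.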
39079
39080// Attempt to commute shufps LHS loads:
39081// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
39082static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39083 SelectionDAG &DAG) {
39084 // TODO: Add vXf64 support.
39085 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39086 return SDValue();
39087
39088 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39089 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39090 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39091 return SDValue();
39092 SDValue N0 = V.getOperand(0);
39093 SDValue N1 = V.getOperand(1);
39094 unsigned Imm = V.getConstantOperandVal(2);
39095 const X86Subtarget &Subtarget =
39096 static_cast<const X86Subtarget &>(DAG.getSubtarget());
39097 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39098 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39099 return SDValue();
39100 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39101 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39102 DAG.getTargetConstant(Imm, DL, MVT::i8));
39103 };
39104
39105 switch (N.getOpcode()) {
39106 case X86ISD::VPERMILPI:
39107 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39108 unsigned Imm = N.getConstantOperandVal(1);
39109 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39110 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39111 }
39112 break;
39113 case X86ISD::SHUFP: {
39114 SDValue N0 = N.getOperand(0);
39115 SDValue N1 = N.getOperand(1);
39116 unsigned Imm = N.getConstantOperandVal(2);
39117 if (N0 == N1) {
39118 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
39119 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
39120 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39121 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
39122 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
39123 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
39124 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
39125 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
39126 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
39127 }
39128 break;
39129 }
39130 }
39131
39132 return SDValue();
39133}
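// Illustrative sketch (scalar model, not part of this file): SHUFPS builds its
// low two result lanes from the first operand and its high two from the
// second, each selected by a 2-bit field of the immediate. Commuting the
// operands therefore only needs the two nibbles of the immediate swapped, plus
// the half-swap fix-up applied to the enclosing shuffle's immediate above
// (the Imm ^ 0xAA / ^ 0x0A / ^ 0xA0 adjustments).
#include <array>
#include <cstdint>

using V4f = std::array<float, 4>;

static V4f shufps(const V4f &A, const V4f &B, uint8_t Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

static uint8_t swapNibbles(uint8_t Imm) {
  return uint8_t(((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4));
}
// shufps(B, A, swapNibbles(Imm)) yields shufps(A, B, Imm) with its two 64-bit
// halves exchanged, which the caller compensates for in the outer immediate.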
39134
39135// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
39136static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
39137 const SDLoc &DL) {
39138 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39139 EVT ShuffleVT = N.getValueType();
39140
39141 auto IsMergeableWithShuffle = [](SDValue Op) {
39142 // AllZeros/AllOnes constants are freely shuffled and will peek through
39143 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
39144 // merge with target shuffles if it has one use so shuffle combining is
39145 // likely to kick in.
39146 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
39147 ISD::isBuildVectorAllZeros(Op.getNode()) ||
39148 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
39149 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
39150 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
39151 };
39152 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
39153 // Ensure we only shuffle whole vector src elements, unless it's a logical
39154 // binop where we can more aggressively move shuffles from dst to src.
39155 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
39156 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
39157 };
39158
39159 unsigned Opc = N.getOpcode();
39160 switch (Opc) {
39161 // Unary and Unary+Permute Shuffles.
39162 case X86ISD::PSHUFB: {
39163 // Don't merge PSHUFB if it contains zero'd elements.
39164 SmallVector<int> Mask;
39165 SmallVector<SDValue> Ops;
39166 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
39167 Mask))
39168 break;
39169     LLVM_FALLTHROUGH;
39170 }
39171 case X86ISD::VBROADCAST:
39172 case X86ISD::MOVDDUP:
39173 case X86ISD::PSHUFD:
39174 case X86ISD::PSHUFHW:
39175 case X86ISD::PSHUFLW:
39176 case X86ISD::VPERMI:
39177 case X86ISD::VPERMILPI: {
39178 if (N.getOperand(0).getValueType() == ShuffleVT &&
39179 N->isOnlyUserOf(N.getOperand(0).getNode())) {
39180 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39181 unsigned SrcOpcode = N0.getOpcode();
39182 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
39183 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39184 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39185 if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
39186 SDValue LHS, RHS;
39187 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39188 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39189 if (N.getNumOperands() == 2) {
39190 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
39191 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
39192 } else {
39193 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
39194 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
39195 }
39196 EVT OpVT = N0.getValueType();
39197 return DAG.getBitcast(ShuffleVT,
39198 DAG.getNode(SrcOpcode, DL, OpVT,
39199 DAG.getBitcast(OpVT, LHS),
39200 DAG.getBitcast(OpVT, RHS)));
39201 }
39202 }
39203 }
39204 break;
39205 }
39206 // Binary and Binary+Permute Shuffles.
39207 case X86ISD::INSERTPS: {
39208 // Don't merge INSERTPS if it contains zero'd elements.
39209 unsigned InsertPSMask = N.getConstantOperandVal(2);
39210 unsigned ZeroMask = InsertPSMask & 0xF;
39211 if (ZeroMask != 0)
39212 break;
39213     LLVM_FALLTHROUGH;
39214 }
39215 case X86ISD::MOVSD:
39216 case X86ISD::MOVSS:
39217 case X86ISD::BLENDI:
39218 case X86ISD::SHUFP:
39219 case X86ISD::UNPCKH:
39220 case X86ISD::UNPCKL: {
39221 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
39222 N->isOnlyUserOf(N.getOperand(1).getNode())) {
39223 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39224 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
39225 unsigned SrcOpcode = N0.getOpcode();
39226 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
39227 IsSafeToMoveShuffle(N0, SrcOpcode) &&
39228 IsSafeToMoveShuffle(N1, SrcOpcode)) {
39229 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39230 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
39231 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39232 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
39233 // Ensure the total number of shuffles doesn't increase by folding this
39234 // shuffle through to the source ops.
39235 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
39236 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
39237 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
39238 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
39239 SDValue LHS, RHS;
39240 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39241 Op10 = DAG.getBitcast(ShuffleVT, Op10);
39242 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39243 Op11 = DAG.getBitcast(ShuffleVT, Op11);
39244 if (N.getNumOperands() == 3) {
39245 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
39246 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
39247 } else {
39248 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
39249 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
39250 }
39251 EVT OpVT = N0.getValueType();
39252 return DAG.getBitcast(ShuffleVT,
39253 DAG.getNode(SrcOpcode, DL, OpVT,
39254 DAG.getBitcast(OpVT, LHS),
39255 DAG.getBitcast(OpVT, RHS)));
39256 }
39257 }
39258 }
39259 break;
39260 }
39261 }
39262 return SDValue();
39263}
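// Illustrative sketch (scalar model, not part of this file): the fold above
// relies on shuffles commuting with lane-wise binary ops: shuffling each
// operand first and then applying the op lane by lane gives the same vector as
// applying the op and then shuffling its result.
#include <array>
#include <cstddef>

template <typename T, std::size_t N, typename BinOp>
std::array<T, N> laneWiseBinOp(const std::array<T, N> &X,
                               const std::array<T, N> &Y, BinOp Op) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = Op(X[i], Y[i]);
  return R;
}

template <typename T, std::size_t N>
std::array<T, N> applyShuffle(const std::array<T, N> &V,
                              const std::array<int, N> &Mask) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = V[Mask[i]];
  return R;
}
// For any mask M and lane-wise Op:
//   applyShuffle(laneWiseBinOp(X, Y, Op), M)
//     == laneWiseBinOp(applyShuffle(X, M), applyShuffle(Y, M), Op).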
39264
39265/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
39266static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
39267 SelectionDAG &DAG,
39268 const SDLoc &DL) {
39269   assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
39270
39271 MVT VT = V.getSimpleValueType();
39272 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
39273 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
39274 unsigned SrcOpc0 = Src0.getOpcode();
39275 unsigned SrcOpc1 = Src1.getOpcode();
39276 EVT SrcVT0 = Src0.getValueType();
39277 EVT SrcVT1 = Src1.getValueType();
39278
39279 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
39280 return SDValue();
39281
39282 switch (SrcOpc0) {
39283 case X86ISD::MOVDDUP: {
39284 SDValue LHS = Src0.getOperand(0);
39285 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39286 SDValue Res =
39287 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
39288 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
39289 return DAG.getBitcast(VT, Res);
39290 }
39291 case X86ISD::VPERMILPI:
39292 // TODO: Handle v4f64 permutes with different low/high lane masks.
39293 if (SrcVT0 == MVT::v4f64) {
39294 uint64_t Mask = Src0.getConstantOperandVal(1);
39295 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
39296 break;
39297 }
39298     LLVM_FALLTHROUGH;
39299 case X86ISD::VSHLI:
39300 case X86ISD::VSRLI:
39301 case X86ISD::VSRAI:
39302 case X86ISD::PSHUFD:
39303 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
39304 SDValue LHS = Src0.getOperand(0);
39305 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39306 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
39307 V.getOperand(2));
39308 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
39309 return DAG.getBitcast(VT, Res);
39310 }
39311 break;
39312 }
39313
39314 return SDValue();
39315}
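// Illustrative sketch (scalar model, not part of this file): the fold above
// pulls per-lane repeated ops through a VPERM2X128, which is valid because the
// lane shuffle only rearranges (or zeroes) whole 128-bit halves. Each nibble
// of the immediate selects one of the four source halves, with bit 3 zeroing
// that half instead; halves are modelled here as pairs of 64-bit values.
#include <array>
#include <cstdint>

using Half = std::array<uint64_t, 2>; // one 128-bit lane
using V256 = std::array<Half, 2>;     // two lanes

static V256 vperm2x128(const V256 &A, const V256 &B, uint8_t Imm) {
  const Half Lanes[4] = {A[0], A[1], B[0], B[1]};
  auto pick = [&](unsigned Nibble) -> Half {
    if (Nibble & 0x8)
      return Half{0, 0};        // zeroing bit set for this half
    return Lanes[Nibble & 0x3]; // select one of the four source halves
  };
  return {pick(Imm & 0xF), pick((Imm >> 4) & 0xF)};
}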
39316
39317/// Try to combine x86 target specific shuffles.
39318static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
39319 TargetLowering::DAGCombinerInfo &DCI,
39320 const X86Subtarget &Subtarget) {
39321 SDLoc DL(N);
39322 MVT VT = N.getSimpleValueType();
39323 SmallVector<int, 4> Mask;
39324 unsigned Opcode = N.getOpcode();
39325
39326 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
39327 return R;
39328
39329 // Handle specific target shuffles.
39330 switch (Opcode) {
39331 case X86ISD::MOVDDUP: {
39332 SDValue Src = N.getOperand(0);
39333 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
39334 if (VT == MVT::v2f64 && Src.hasOneUse() &&
39335 ISD::isNormalLoad(Src.getNode())) {
39336 LoadSDNode *LN = cast<LoadSDNode>(Src);
39337 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
39338 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
39339 DCI.CombineTo(N.getNode(), Movddup);
39340 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39341 DCI.recursivelyDeleteUnusedNodes(LN);
39342 return N; // Return N so it doesn't get rechecked!
39343 }
39344 }
39345
39346 return SDValue();
39347 }
39348 case X86ISD::VBROADCAST: {
39349 SDValue Src = N.getOperand(0);
39350 SDValue BC = peekThroughBitcasts(Src);
39351 EVT SrcVT = Src.getValueType();
39352 EVT BCVT = BC.getValueType();
39353
39354 // If broadcasting from another shuffle, attempt to simplify it.
39355 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
39356 if (isTargetShuffle(BC.getOpcode()) &&
39357 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
39358 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
39359 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
39360 SM_SentinelUndef);
39361 for (unsigned i = 0; i != Scale; ++i)
39362 DemandedMask[i] = i;
39363 if (SDValue Res = combineX86ShufflesRecursively(
39364 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
39365 X86::MaxShuffleCombineDepth,
39366 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
39367 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
39368 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39369 DAG.getBitcast(SrcVT, Res));
39370 }
39371
39372 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
39373 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
39374 if (Src.getOpcode() == ISD::BITCAST &&
39375 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
39376 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
39377 FixedVectorType::isValidElementType(
39378 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
39379 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
39380 VT.getVectorNumElements());
39381 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39382 }
39383
39384 // Reduce broadcast source vector to lowest 128-bits.
39385 if (SrcVT.getSizeInBits() > 128)
39386 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39387 extract128BitVector(Src, 0, DAG, DL));
39388
39389 // broadcast(scalar_to_vector(x)) -> broadcast(x).
39390 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
39391 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39392
39393 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
39394 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39395 isNullConstant(Src.getOperand(1)) &&
39396 DAG.getTargetLoweringInfo().isTypeLegal(
39397 Src.getOperand(0).getValueType()))
39398 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39399
39400 // Share broadcast with the longest vector and extract low subvector (free).
39401 // Ensure the same SDValue from the SDNode use is being used.
39402 for (SDNode *User : Src->uses())
39403 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
39404 Src == User->getOperand(0) &&
39405 User->getValueSizeInBits(0).getFixedSize() >
39406 VT.getFixedSizeInBits()) {
39407 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
39408 VT.getSizeInBits());
39409 }
39410
39411 // vbroadcast(scalarload X) -> vbroadcast_load X
39412 // For float loads, extract other uses of the scalar from the broadcast.
39413 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
39414 ISD::isNormalLoad(Src.getNode())) {
39415 LoadSDNode *LN = cast<LoadSDNode>(Src);
39416 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39417 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39418 SDValue BcastLd =
39419 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39420 LN->getMemoryVT(), LN->getMemOperand());
39421 // If the load value is used only by N, replace it via CombineTo N.
39422 bool NoReplaceExtract = Src.hasOneUse();
39423 DCI.CombineTo(N.getNode(), BcastLd);
39424 if (NoReplaceExtract) {
39425 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39426 DCI.recursivelyDeleteUnusedNodes(LN);
39427 } else {
39428 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
39429 DAG.getIntPtrConstant(0, DL));
39430 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
39431 }
39432 return N; // Return N so it doesn't get rechecked!
39433 }
39434
39435 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
39436 // i16. So shrink it ourselves if we can make a broadcast_load.
39437 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
39438 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
39439       assert(Subtarget.hasAVX2() && "Expected AVX2");
39440 SDValue TruncIn = Src.getOperand(0);
39441
39442 // If this is a truncate of a non-extending load we can just narrow it to
39443 // use a broadcast_load.
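// e.g. (vbroadcast (i16 trunc (i32 load p))) only needs the low 16 bits of
// the load, so it can become (vbroadcast_load<i16> p) directly
// (illustrative example; relies on x86 being little-endian).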
39444 if (ISD::isNormalLoad(TruncIn.getNode())) {
39445 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
39446 // Unless it's volatile or atomic.
39447 if (LN->isSimple()) {
39448 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39449 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39450 SDValue BcastLd = DAG.getMemIntrinsicNode(
39451 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39452 LN->getPointerInfo(), LN->getOriginalAlign(),
39453 LN->getMemOperand()->getFlags());
39454 DCI.CombineTo(N.getNode(), BcastLd);
39455 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39456 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39457 return N; // Return N so it doesn't get rechecked!
39458 }
39459 }
39460
39461 // If this is a truncate of an i16 extload, we can directly replace it.
39462 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
39463 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
39464 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
39465 if (LN->getMemoryVT().getSizeInBits() == 16) {
39466 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39467 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39468 SDValue BcastLd =
39469 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39470 LN->getMemoryVT(), LN->getMemOperand());
39471 DCI.CombineTo(N.getNode(), BcastLd);
39472 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39473 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39474 return N; // Return N so it doesn't get rechecked!
39475 }
39476 }
39477
39478 // If this is a truncate of a load that has been shifted right, we can
39479 // offset the pointer and use a narrower load.
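// e.g. (vbroadcast (i16 trunc (srl (i32 load p), 16))) can load the i16 at
// p+2 instead: ShiftAmt/8 gives the byte offset of the truncated field
// (illustrative example, little-endian layout assumed).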
39480 if (TruncIn.getOpcode() == ISD::SRL &&
39481 TruncIn.getOperand(0).hasOneUse() &&
39482 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
39483 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
39484 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
39485 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
39486 // Make sure the shift amount and the load size are divisible by 16.
39487 // Don't do this if the load is volatile or atomic.
39488 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
39489 LN->isSimple()) {
39490 unsigned Offset = ShiftAmt / 8;
39491 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39492 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
39493 TypeSize::Fixed(Offset), DL);
39494 SDValue Ops[] = { LN->getChain(), Ptr };
39495 SDValue BcastLd = DAG.getMemIntrinsicNode(
39496 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39497 LN->getPointerInfo().getWithOffset(Offset),
39498 LN->getOriginalAlign(),
39499 LN->getMemOperand()->getFlags());
39500 DCI.CombineTo(N.getNode(), BcastLd);
39501 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39502 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39503 return N; // Return N so it doesn't get rechecked!
39504 }
39505 }
39506 }
39507
39508 // vbroadcast(vzload X) -> vbroadcast_load X
39509 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
39510 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
39511 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
39512 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39513 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39514 SDValue BcastLd =
39515 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39516 LN->getMemoryVT(), LN->getMemOperand());
39517 DCI.CombineTo(N.getNode(), BcastLd);
39518 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39519 DCI.recursivelyDeleteUnusedNodes(LN);
39520 return N; // Return N so it doesn't get rechecked!
39521 }
39522 }
39523
39524 // vbroadcast(vector load X) -> vbroadcast_load
39525 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
39526 SrcVT == MVT::v4i32) &&
39527 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
39528 LoadSDNode *LN = cast<LoadSDNode>(Src);
39529 // Unless the load is volatile or atomic.
39530 if (LN->isSimple()) {
39531 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39532 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39533 SDValue BcastLd = DAG.getMemIntrinsicNode(
39534 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
39535 LN->getPointerInfo(), LN->getOriginalAlign(),
39536 LN->getMemOperand()->getFlags());
39537 DCI.CombineTo(N.getNode(), BcastLd);
39538 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39539 DCI.recursivelyDeleteUnusedNodes(LN);
39540 return N; // Return N so it doesn't get rechecked!
39541 }
39542 }
39543
39544 return SDValue();
39545 }
39546 case X86ISD::VZEXT_MOVL: {
39547 SDValue N0 = N.getOperand(0);
39548
39549 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
39550 // the load is volatile.
39551 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
39552 auto *LN = cast<LoadSDNode>(N0);
39553 if (SDValue VZLoad =
39554 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
39555 DCI.CombineTo(N.getNode(), VZLoad);
39556 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39557 DCI.recursivelyDeleteUnusedNodes(LN);
39558 return N;
39559 }
39560 }
39561
39562 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
39563 // and can just use a VZEXT_LOAD.
39564 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
39565 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
39566 auto *LN = cast<MemSDNode>(N0);
39567 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
39568 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39569 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39570 SDValue VZLoad =
39571 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
39572 LN->getMemoryVT(), LN->getMemOperand());
39573 DCI.CombineTo(N.getNode(), VZLoad);
39574 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39575 DCI.recursivelyDeleteUnusedNodes(LN);
39576 return N;
39577 }
39578 }
39579
39580 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
39581 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
39582 // if the upper bits of the i64 are zero.
39583 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39584 N0.getOperand(0).hasOneUse() &&
39585 N0.getOperand(0).getValueType() == MVT::i64) {
39586 SDValue In = N0.getOperand(0);
39587 APInt Mask = APInt::getHighBitsSet(64, 32);
39588 if (DAG.MaskedValueIsZero(In, Mask)) {
39589 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
39590 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
39591 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
39592 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
39593 return DAG.getBitcast(VT, Movl);
39594 }
39595 }
39596
39597 // Load a scalar integer constant directly to XMM instead of transferring an
39598 // immediate value from GPR.
39599 // vzext_movl (scalar_to_vector C) --> load [C,0...]
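// e.g. (v4i32 vzext_movl (scalar_to_vector (i32 42))) becomes a constant
// pool load of <i32 42, i32 0, i32 0, i32 0> (illustrative example).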
39600 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39601 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
39602 // Create a vector constant - scalar constant followed by zeros.
39603 EVT ScalarVT = N0.getOperand(0).getValueType();
39604 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
39605 unsigned NumElts = VT.getVectorNumElements();
39606 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
39607 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
39608 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
39609
39610 // Load the vector constant from constant pool.
39611 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
39612 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
39613 MachinePointerInfo MPI =
39614 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
39615 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
39616 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
39617 MachineMemOperand::MOLoad);
39618 }
39619 }
39620
39621 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
39622 // insert into a zero vector. This helps get VZEXT_MOVL closer to
39623 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
39624 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
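// e.g. (v8i32 vzext_movl (insert_subvector undef, (v4i32 X), 0)) becomes
// (insert_subvector (v8i32 zero), (v4i32 vzext_movl X), 0), so the movl now
// operates on the narrow 128-bit value (illustrative example).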
39625 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
39626 SDValue V = peekThroughOneUseBitcasts(N0);
39627
39628 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
39629 isNullConstant(V.getOperand(2))) {
39630 SDValue In = V.getOperand(1);
39631 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
39632 In.getValueSizeInBits() /
39633 VT.getScalarSizeInBits());
39634 In = DAG.getBitcast(SubVT, In);
39635 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
39636 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
39637 getZeroVector(VT, Subtarget, DAG, DL), Movl,
39638 V.getOperand(2));
39639 }
39640 }
39641
39642 return SDValue();
39643 }
39644 case X86ISD::BLENDI: {
39645 SDValue N0 = N.getOperand(0);
39646 SDValue N1 = N.getOperand(1);
39647
39648 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
39649 // TODO: Handle MVT::v16i16 repeated blend mask.
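// e.g. a v4f64 blend with imm 0b0101 whose operands are bitcasts from v8f32
// can instead blend at v8f32 with each mask bit repeated Scale = 64/32 = 2
// times, giving imm 0b00110011 (illustrative example).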
39650 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
39651 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
39652 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
39653 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
39654 SrcVT.getScalarSizeInBits() >= 32) {
39655 unsigned BlendMask = N.getConstantOperandVal(2);
39656 unsigned Size = VT.getVectorNumElements();
39657 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
39658 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
39659 return DAG.getBitcast(
39660 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
39661 N1.getOperand(0),
39662 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
39663 }
39664 }
39665 return SDValue();
39666 }
39667 case X86ISD::SHUFP: {
39668 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
39669 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
39670 // TODO: Support types other than v4f32.
39671 if (VT == MVT::v4f32) {
39672 bool Updated = false;
39673 SmallVector<int> Mask;
39674 SmallVector<SDValue> Ops;
39675 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
39676 Ops.size() == 2) {
39677 for (int i = 0; i != 2; ++i) {
39678 SmallVector<SDValue> SubOps;
39679 SmallVector<int> SubMask, SubScaledMask;
39680 SDValue Sub = peekThroughBitcasts(Ops[i]);
39681 // TODO: Scaling might be easier if we specify the demanded elts.
39682 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
39683 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
39684 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
39685 int Ofs = i * 2;
39686 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
39687 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
39688 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
39689 Updated = true;
39690 }
39691 }
39692 }
39693 if (Updated) {
39694 for (int &M : Mask)
39695 M %= 4;
39696 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39697 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
39698 }
39699 }
39700 return SDValue();
39701 }
39702 case X86ISD::VPERMI: {
39703 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
39704 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
39705 SDValue N0 = N.getOperand(0);
39706 SDValue N1 = N.getOperand(1);
39707 unsigned EltSizeInBits = VT.getScalarSizeInBits();
39708 if (N0.getOpcode() == ISD::BITCAST &&
39709 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
39710 SDValue Src = N0.getOperand(0);
39711 EVT SrcVT = Src.getValueType();
39712 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
39713 return DAG.getBitcast(VT, Res);
39714 }
39715 return SDValue();
39716 }
39717 case X86ISD::VPERM2X128: {
39718 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
39719 SDValue LHS = N->getOperand(0);
39720 SDValue RHS = N->getOperand(1);
39721 if (LHS.getOpcode() == ISD::BITCAST &&
39722 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
39723 EVT SrcVT = LHS.getOperand(0).getValueType();
39724 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
39725 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
39726 DAG.getBitcast(SrcVT, LHS),
39727 DAG.getBitcast(SrcVT, RHS),
39728 N->getOperand(2)));
39729 }
39730 }
39731
39732 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
39733 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
39734 return Res;
39735
39736 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
39737 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
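// The immediate selects each 128-bit half of the result from the four
// candidate halves {LHS.lo, LHS.hi, RHS.lo, RHS.hi} numbered 0-3 (bits 3 and
// 7 request a zeroed half, which the FindSubVector128 lambda below rejects
// as Idx > 3). e.g. vperm2x128(concat(X,Y), concat(Z,W), 0x31) --> concat(Y,W),
// since 0x1 picks LHS.hi and 0x3 picks RHS.hi (illustrative example).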
39738 auto FindSubVector128 = [&](unsigned Idx) {
39739 if (Idx > 3)
39740 return SDValue();
39741 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
39742 SmallVector<SDValue> SubOps;
39743 if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
39744 return SubOps[Idx & 1];
39745 unsigned NumElts = Src.getValueType().getVectorNumElements();
39746 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
39747 Src.getOperand(1).getValueSizeInBits() == 128 &&
39748 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
39749 return Src.getOperand(1);
39750 }
39751 return SDValue();
39752 };
39753 unsigned Imm = N.getConstantOperandVal(2);
39754 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
39755 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
39756 MVT SubVT = VT.getHalfNumVectorElementsVT();
39757 SubLo = DAG.getBitcast(SubVT, SubLo);
39758 SubHi = DAG.getBitcast(SubVT, SubHi);
39759 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
39760 }
39761 }
39762 return SDValue();
39763 }
39764 case X86ISD::PSHUFD:
39765 case X86ISD::PSHUFLW:
39766 case X86ISD::PSHUFHW:
39767 Mask = getPSHUFShuffleMask(N);
39768 assert(Mask.size() == 4);
39769 break;
39770 case X86ISD::MOVSD:
39771 case X86ISD::MOVSH:
39772 case X86ISD::MOVSS: {
39773 SDValue N0 = N.getOperand(0);
39774 SDValue N1 = N.getOperand(1);
39775
39776 // Canonicalize scalar FPOps:
39777 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
39778 // If commutable, allow OP(N1[0], N0[0]).
39779 unsigned Opcode1 = N1.getOpcode();
39780 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
39781 Opcode1 == ISD::FDIV) {
39782 SDValue N10 = N1.getOperand(0);
39783 SDValue N11 = N1.getOperand(1);
39784 if (N10 == N0 ||
39785 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
39786 if (N10 != N0)
39787 std::swap(N10, N11);
39788 MVT SVT = VT.getVectorElementType();
39789 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
39790 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
39791 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
39792 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
39793 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
39794 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
39795 }
39796 }
39797
39798 return SDValue();
39799 }
39800 case X86ISD::INSERTPS: {
39801 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
39802 SDValue Op0 = N.getOperand(0);
39803 SDValue Op1 = N.getOperand(1);
39804 unsigned InsertPSMask = N.getConstantOperandVal(2);
39805 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
39806 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
39807 unsigned ZeroMask = InsertPSMask & 0xF;
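// The imm8 layout is: bits[7:6] = source element index (CountS),
// bits[5:4] = destination element index (CountD), bits[3:0] = zero mask.
// e.g. imm 0x9D takes Op1[2], inserts it into lane 1 of Op0, and zeroes
// lanes 0, 2 and 3 (illustrative decoding, matching the fields above).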
39808
39809 // If we zero out all elements from Op0 then we don't need to reference it.
39810 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
39811 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
39812 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39813
39814 // If we zero out the element from Op1 then we don't need to reference it.
39815 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
39816 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39817 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39818
39819 // Attempt to merge insertps Op1 with an inner target shuffle node.
39820 SmallVector<int, 8> TargetMask1;
39821 SmallVector<SDValue, 2> Ops1;
39822 APInt KnownUndef1, KnownZero1;
39823 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
39824 KnownZero1)) {
39825 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
39826 // Zero/UNDEF insertion - zero out element and remove dependency.
39827 InsertPSMask |= (1u << DstIdx);
39828 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39829 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39830 }
39831 // Update insertps mask srcidx and reference the source input directly.
39832 int M = TargetMask1[SrcIdx];
39833 assert(0 <= M && M < 8 && "Shuffle index out of range");
39834 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
39835 Op1 = Ops1[M < 4 ? 0 : 1];
39836 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39837 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39838 }
39839
39840 // Attempt to merge insertps Op0 with an inner target shuffle node.
39841 SmallVector<int, 8> TargetMask0;
39842 SmallVector<SDValue, 2> Ops0;
39843 APInt KnownUndef0, KnownZero0;
39844 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
39845 KnownZero0)) {
39846 bool Updated = false;
39847 bool UseInput00 = false;
39848 bool UseInput01 = false;
39849 for (int i = 0; i != 4; ++i) {
39850 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
39851 // No change if element is already zero or the inserted element.
39852 continue;
39853 } else if (KnownUndef0[i] || KnownZero0[i]) {
39854 // If the target mask is undef/zero then we must zero the element.
39855 InsertPSMask |= (1u << i);
39856 Updated = true;
39857 continue;
39858 }
39859
39860 // The input vector element must be inline.
39861 int M = TargetMask0[i];
39862 if (M != i && M != (i + 4))
39863 return SDValue();
39864
39865 // Determine which inputs of the target shuffle we're using.
39866 UseInput00 |= (0 <= M && M < 4);
39867 UseInput01 |= (4 <= M);
39868 }
39869
39870 // If we're not using both inputs of the target shuffle then use the
39871 // referenced input directly.
39872 if (UseInput00 && !UseInput01) {
39873 Updated = true;
39874 Op0 = Ops0[0];
39875 } else if (!UseInput00 && UseInput01) {
39876 Updated = true;
39877 Op0 = Ops0[1];
39878 }
39879
39880 if (Updated)
39881 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39882 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39883 }
39884
39885 // If we're inserting an element from a vbroadcast load, fold the
39886 // load into the X86insertps instruction. We need to convert the scalar
39887 // load to a vector and clear the source lane of the INSERTPS control.
39888 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
39889 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
39890 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
39891 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
39892 MemIntr->getBasePtr(),
39893 MemIntr->getMemOperand());
39894 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
39895 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
39896 Load),
39897 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
39898 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
39899 return Insert;
39900 }
39901 }
39902
39903 return SDValue();
39904 }
39905 default:
39906 return SDValue();
39907 }
39908
39909 // Nuke no-op shuffles that show up after combining.
39910 if (isNoopShuffleMask(Mask))
39911 return N.getOperand(0);
39912
39913 // Look for simplifications involving one or two shuffle instructions.
39914 SDValue V = N.getOperand(0);
39915 switch (N.getOpcode()) {
39916 default:
39917 break;
39918 case X86ISD::PSHUFLW:
39919 case X86ISD::PSHUFHW:
39920 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
39921
39922 // See if this reduces to a PSHUFD which is no more expensive and can
39923 // combine with more operations. Note that it has to at least flip the
39924 // dwords as otherwise it would have been removed as a no-op.
39925 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
39926 int DMask[] = {0, 1, 2, 3};
39927 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
39928 DMask[DOffset + 0] = DOffset + 1;
39929 DMask[DOffset + 1] = DOffset + 0;
39930 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
39931 V = DAG.getBitcast(DVT, V);
39932 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
39933 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
39934 return DAG.getBitcast(VT, V);
39935 }
39936
39937 // Look for shuffle patterns which can be implemented as a single unpack.
39938 // FIXME: This doesn't handle the location of the PSHUFD generically, and
39939 // only works when we have a PSHUFD followed by two half-shuffles.
39940 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
39941 (V.getOpcode() == X86ISD::PSHUFLW ||
39942 V.getOpcode() == X86ISD::PSHUFHW) &&
39943 V.getOpcode() != N.getOpcode() &&
39944 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
39945 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
39946 if (D.getOpcode() == X86ISD::PSHUFD) {
39947 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39948 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
39949 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39950 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39951 int WordMask[8];
39952 for (int i = 0; i < 4; ++i) {
39953 WordMask[i + NOffset] = Mask[i] + NOffset;
39954 WordMask[i + VOffset] = VMask[i] + VOffset;
39955 }
39956 // Map the word mask through the DWord mask.
39957 int MappedMask[8];
39958 for (int i = 0; i < 8; ++i)
39959 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
39960 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
39961 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
39962 // We can replace all three shuffles with an unpack.
39963 V = DAG.getBitcast(VT, D.getOperand(0));
39964 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
39965 : X86ISD::UNPCKH,
39966 DL, VT, V, V);
39967 }
39968 }
39969 }
39970
39971 break;
39972
39973 case X86ISD::PSHUFD:
39974 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
39975 return NewN;
39976
39977 break;
39978 }
39979
39980 return SDValue();
39981}
39982
39983 /// Checks if the shuffle mask takes successive elements alternately from
39984 /// two vectors.
39985/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
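/// A mask such as <0, 1, 2, 7> is rejected because lanes of the same parity
/// must all come from the same source (here the odd lanes mix both inputs).
/// \p Op0Even reports whether operand 0 supplies the even lanes, e.g. it is
/// true for <0, 5, 2, 7> and false for <4, 1, 6, 3> (illustrative examples).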
39986static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
39987
39988 int ParitySrc[2] = {-1, -1};
39989 unsigned Size = Mask.size();
39990 for (unsigned i = 0; i != Size; ++i) {
39991 int M = Mask[i];
39992 if (M < 0)
39993 continue;
39994
39995 // Make sure we are using the matching element from the input.
39996 if ((M % Size) != i)
39997 return false;
39998
39999 // Make sure we use the same input for all elements of the same parity.
40000 int Src = M / Size;
40001 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
40002 return false;
40003 ParitySrc[i % 2] = Src;
40004 }
40005
40006 // Make sure each input is used.
40007 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
40008 return false;
40009
40010 Op0Even = ParitySrc[0] == 0;
40011 return true;
40012}
40013
40014 /// Returns true iff the shuffle node \p N can be replaced with an
40015 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
40016 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
40017 ///
40018 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
40019 /// nodes so it is easier to generically match. We also insert dummy vector shuffle
40020 /// nodes for the operands which explicitly discard the lanes which are unused
40021 /// by this operation, so that the fact that they're unused can flow through
40022 /// the rest of the combiner.
40023static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
40024 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
40025 bool &IsSubAdd) {
40026
40027 EVT VT = N->getValueType(0);
40028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40029 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
40030 !VT.getSimpleVT().isFloatingPoint())
40031 return false;
40032
40033 // We only handle target-independent shuffles.
40034 // FIXME: It would be easy and harmless to use the target shuffle mask
40035 // extraction tool to support more.
40036 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40037 return false;
40038
40039 SDValue V1 = N->getOperand(0);
40040 SDValue V2 = N->getOperand(1);
40041
40042 // Make sure we have an FADD and an FSUB.
40043 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
40044 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
40045 V1.getOpcode() == V2.getOpcode())
40046 return false;
40047
40048 // If there are other uses of these operations we can't fold them.
40049 if (!V1->hasOneUse() || !V2->hasOneUse())
40050 return false;
40051
40052 // Ensure that both operations have the same operands. Note that we can
40053 // commute the FADD operands.
40054 SDValue LHS, RHS;
40055 if (V1.getOpcode() == ISD::FSUB) {
40056 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
40057 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
40058 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
40059 return false;
40060 } else {
40061 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
40062 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
40063 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
40064 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
40065 return false;
40066 }
40067
40068 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40069 bool Op0Even;
40070 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40071 return false;
40072
40073 // It's a subadd if the vector in the even parity is an FADD.
40074 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
40075 : V2->getOpcode() == ISD::FADD;
40076
40077 Opnd0 = LHS;
40078 Opnd1 = RHS;
40079 return true;
40080}
40081
40082/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
40083static SDValue combineShuffleToFMAddSub(SDNode *N,
40084 const X86Subtarget &Subtarget,
40085 SelectionDAG &DAG) {
40086 // We only handle target-independent shuffles.
40087 // FIXME: It would be easy and harmless to use the target shuffle mask
40088 // extraction tool to support more.
40089 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40090 return SDValue();
40091
40092 MVT VT = N->getSimpleValueType(0);
40093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40094 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
40095 return SDValue();
40096
40097 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
40098 SDValue Op0 = N->getOperand(0);
40099 SDValue Op1 = N->getOperand(1);
40100 SDValue FMAdd = Op0, FMSub = Op1;
40101 if (FMSub.getOpcode() != X86ISD::FMSUB)
40102 std::swap(FMAdd, FMSub);
40103
40104 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
40105 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
40106 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
40107 FMAdd.getOperand(2) != FMSub.getOperand(2))
40108 return SDValue();
40109
40110 // Check for correct shuffle mask.
40111 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40112 bool Op0Even;
40113 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40114 return SDValue();
40115
40116 // FMAddSub takes the zeroth operand from the FMSub node.
40117 SDLoc DL(N);
40118 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
40119 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40120 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
40121 FMAdd.getOperand(2));
40122}
40123
40124/// Try to combine a shuffle into a target-specific add-sub or
40125/// mul-add-sub node.
40126static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
40127 const X86Subtarget &Subtarget,
40128 SelectionDAG &DAG) {
40129 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
40130 return V;
40131
40132 SDValue Opnd0, Opnd1;
40133 bool IsSubAdd;
40134 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
40135 return SDValue();
40136
40137 MVT VT = N->getSimpleValueType(0);
40138 SDLoc DL(N);
40139
40140 // Try to generate X86ISD::FMADDSUB node here.
40141 SDValue Opnd2;
40142 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
40143 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40144 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
40145 }
40146
40147 if (IsSubAdd)
40148 return SDValue();
40149
40150 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
40151 // the ADDSUB idiom has been successfully recognized. There are no known
40152 // X86 targets with 512-bit ADDSUB instructions!
40153 if (VT.is512BitVector())
40154 return SDValue();
40155
40156 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
40157 // the ADDSUB idiom has been successfully recognized. There are no known
40158 // X86 targets with FP16 ADDSUB instructions!
40159 if (VT.getVectorElementType() == MVT::f16)
40160 return SDValue();
40161
40162 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
40163}
40164
40165// We are looking for a shuffle where both sources are concatenated with undef
40166// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
40167// if we can express this as a single-source shuffle, that's preferable.
40168static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
40169 const X86Subtarget &Subtarget) {
40170 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
40171 return SDValue();
40172
40173 EVT VT = N->getValueType(0);
40174
40175 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
40176 if (!VT.is128BitVector() && !VT.is256BitVector())
40177 return SDValue();
40178
40179 if (VT.getVectorElementType() != MVT::i32 &&
40180 VT.getVectorElementType() != MVT::i64 &&
40181 VT.getVectorElementType() != MVT::f32 &&
40182 VT.getVectorElementType() != MVT::f64)
40183 return SDValue();
40184
40185 SDValue N0 = N->getOperand(0);
40186 SDValue N1 = N->getOperand(1);
40187
40188 // Check that both sources are concats with undef.
40189 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
40190 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
40191 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
40192 !N1.getOperand(1).isUndef())
40193 return SDValue();
40194
40195 // Construct the new shuffle mask. Elements from the first source retain their
40196 // index, but elements from the second source no longer need to skip an undef.
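// e.g. for v8i32, a mask element of 9 (the second element of
// concat(t2, undef)) becomes 9 - 8/2 = 5, which is the second element of t2
// inside the new concat(t1, t2) source (illustrative example).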
40197 SmallVector<int, 8> Mask;
40198 int NumElts = VT.getVectorNumElements();
40199
40200 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
40201 for (int Elt : SVOp->getMask())
40202 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
40203
40204 SDLoc DL(N);
40205 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
40206 N1.getOperand(0));
40207 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
40208}
40209
40210/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
40211/// low half of each source vector and does not set any high half elements in
40212/// the destination vector, narrow the shuffle to half its original size.
40213static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
40214 if (!Shuf->getValueType(0).isSimple())
40215 return SDValue();
40216 MVT VT = Shuf->getSimpleValueType(0);
40217 if (!VT.is256BitVector() && !VT.is512BitVector())
40218 return SDValue();
40219
40220 // See if we can ignore all of the high elements of the shuffle.
40221 ArrayRef<int> Mask = Shuf->getMask();
40222 if (!isUndefUpperHalf(Mask))
40223 return SDValue();
40224
40225 // Check if the shuffle mask accesses only the low half of each input vector
40226 // (half-index output is 0 or 2).
40227 int HalfIdx1, HalfIdx2;
40228 SmallVector<int, 8> HalfMask(Mask.size() / 2);
40229 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
40230 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
40231 return SDValue();
40232
40233 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
40234 // The trick is knowing that all of the insert/extract are actually free
40235 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
40236 // of narrow inputs into a narrow output, and that is always cheaper than
40237 // the wide shuffle that we started with.
40238 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
40239 Shuf->getOperand(1), HalfMask, HalfIdx1,
40240 HalfIdx2, false, DAG, /*UseConcat*/true);
40241}
40242
40243static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
40244 TargetLowering::DAGCombinerInfo &DCI,
40245 const X86Subtarget &Subtarget) {
40246 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
40247 if (SDValue V = narrowShuffle(Shuf, DAG))
40248 return V;
40249
40250 // If we have legalized the vector types, look for blends of FADD and FSUB
40251 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
40252 SDLoc dl(N);
40253 EVT VT = N->getValueType(0);
40254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40255 if (TLI.isTypeLegal(VT))
40256 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
40257 return AddSub;
40258
40259 // Attempt to combine into a vector load/broadcast.
40260 if (SDValue LD = combineToConsecutiveLoads(
40261 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
40262 return LD;
40263
40264 // For AVX2, we sometimes want to combine
40265 // (vector_shuffle <mask> (concat_vectors t1, undef)
40266 // (concat_vectors t2, undef))
40267 // Into:
40268 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
40269 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
40270 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
40271 return ShufConcat;
40272
40273 if (isTargetShuffle(N->getOpcode())) {
40274 SDValue Op(N, 0);
40275 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
40276 return Shuffle;
40277
40278 // Try recursively combining arbitrary sequences of x86 shuffle
40279 // instructions into higher-order shuffles. We do this after combining
40280 // specific PSHUF instruction sequences into their minimal form so that we
40281 // can evaluate how many specialized shuffle instructions are involved in
40282 // a particular chain.
40283 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40284 return Res;
40285
40286 // Simplify source operands based on shuffle mask.
40287 // TODO - merge this into combineX86ShufflesRecursively.
40288 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
40289 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
40290 return SDValue(N, 0);
40291
40292 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40293 // Perform this after other shuffle combines to allow inner shuffles to be
40294 // combined away first.
40295 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N)))
40296 return BinOp;
40297 }
40298
40299 return SDValue();
40300}
40301
40302// Simplify variable target shuffle masks based on the demanded elements.
40303// TODO: Handle DemandedBits in mask indices as well?
40304bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
40305 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
40306 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
40307 // If we're demanding all elements, don't bother trying to simplify the mask.
40308 unsigned NumElts = DemandedElts.getBitWidth();
40309 if (DemandedElts.isAllOnes())
40310 return false;
40311
40312 SDValue Mask = Op.getOperand(MaskIndex);
40313 if (!Mask.hasOneUse())
40314 return false;
40315
40316 // Attempt to generically simplify the variable shuffle mask.
40317 APInt MaskUndef, MaskZero;
40318 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
40319 Depth + 1))
40320 return true;
40321
40322 // Attempt to extract+simplify a (constant pool load) shuffle mask.
40323 // TODO: Support other types from getTargetShuffleMaskIndices?
40324 SDValue BC = peekThroughOneUseBitcasts(Mask);
40325 EVT BCVT = BC.getValueType();
40326 auto *Load = dyn_cast<LoadSDNode>(BC);
40327 if (!Load)
40328 return false;
40329
40330 const Constant *C = getTargetConstantFromNode(Load);
40331 if (!C)
40332 return false;
40333
40334 Type *CTy = C->getType();
40335 if (!CTy->isVectorTy() ||
40336 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
40337 return false;
40338
40339 // Handle scaling for i64 elements on 32-bit targets.
40340 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
40341 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
40342 return false;
40343 unsigned Scale = NumCstElts / NumElts;
40344
40345 // Simplify mask if we have an undemanded element that is not undef.
40346 bool Simplified = false;
40347 SmallVector<Constant *, 32> ConstVecOps;
40348 for (unsigned i = 0; i != NumCstElts; ++i) {
40349 Constant *Elt = C->getAggregateElement(i);
40350 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
40351 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
40352 Simplified = true;
40353 continue;
40354 }
40355 ConstVecOps.push_back(Elt);
40356 }
40357 if (!Simplified)
40358 return false;
40359
40360 // Generate new constant pool entry + legalize immediately for the load.
40361 SDLoc DL(Op);
40362 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
40363 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
40364 SDValue NewMask = TLO.DAG.getLoad(
40365 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
40366 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
40367 Load->getAlign());
40368 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
40369}
40370
40371bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
40372 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
40373 TargetLoweringOpt &TLO, unsigned Depth) const {
40374 int NumElts = DemandedElts.getBitWidth();
40375 unsigned Opc = Op.getOpcode();
40376 EVT VT = Op.getValueType();
40377
40378 // Handle special case opcodes.
40379 switch (Opc) {
40380 case X86ISD::PMULDQ:
40381 case X86ISD::PMULUDQ: {
40382 APInt LHSUndef, LHSZero;
40383 APInt RHSUndef, RHSZero;
40384 SDValue LHS = Op.getOperand(0);
40385 SDValue RHS = Op.getOperand(1);
40386 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40387 Depth + 1))
40388 return true;
40389 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40390 Depth + 1))
40391 return true;
40392 // Multiply by zero.
40393 KnownZero = LHSZero | RHSZero;
40394 break;
40395 }
40396 case X86ISD::VPMADDWD: {
40397 APInt LHSUndef, LHSZero;
40398 APInt RHSUndef, RHSZero;
40399 SDValue LHS = Op.getOperand(0);
40400 SDValue RHS = Op.getOperand(1);
40401 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
40402
40403 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
40404 Depth + 1))
40405 return true;
40406 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
40407 Depth + 1))
40408 return true;
40409
40410 // TODO: Multiply by zero.
40411
40412 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
40413 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
40414 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
40415 Depth + 1))
40416 return true;
40417 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
40418 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
40419 Depth + 1))
40420 return true;
40421 break;
40422 }
40423 case X86ISD::PSADBW: {
40424 SDValue LHS = Op.getOperand(0);
40425 SDValue RHS = Op.getOperand(1);
40426 assert(VT.getScalarType() == MVT::i64 &&
40427 LHS.getValueType() == RHS.getValueType() &&
40428 LHS.getValueType().getScalarType() == MVT::i8 &&
40429 "Unexpected PSADBW types");
40430
40431 // Aggressively peek through ops to get at the demanded elts.
40432 if (!DemandedElts.isAllOnes()) {
40433 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
40434 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
40435 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
40436 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40437 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
40438 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40439 if (NewLHS || NewRHS) {
40440 NewLHS = NewLHS ? NewLHS : LHS;
40441 NewRHS = NewRHS ? NewRHS : RHS;
40442 return TLO.CombineTo(
40443 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40444 }
40445 }
40446 break;
40447 }
40448 case X86ISD::VSHL:
40449 case X86ISD::VSRL:
40450 case X86ISD::VSRA: {
40451 // We only need the bottom 64-bits of the (128-bit) shift amount.
40452 SDValue Amt = Op.getOperand(1);
40453 MVT AmtVT = Amt.getSimpleValueType();
40454 assert(AmtVT.is128BitVector() && "Unexpected value type");
40455
40456 // If we reuse the shift amount just for SSE shift amounts then we know that
40457 // only the bottom 64-bits are ever used.
40458 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
40459 unsigned UseOpc = Use->getOpcode();
40460 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
40461 UseOpc == X86ISD::VSRA) &&
40462 Use->getOperand(0) != Amt;
40463 });
40464
40465 APInt AmtUndef, AmtZero;
40466 unsigned NumAmtElts = AmtVT.getVectorNumElements();
40467 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
40468 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
40469 Depth + 1, AssumeSingleUse))
40470 return true;
40471 LLVM_FALLTHROUGH;
40472 }
40473 case X86ISD::VSHLI:
40474 case X86ISD::VSRLI:
40475 case X86ISD::VSRAI: {
40476 SDValue Src = Op.getOperand(0);
40477 APInt SrcUndef;
40478 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
40479 Depth + 1))
40480 return true;
40481
40482 // Aggressively peek through ops to get at the demanded elts.
40483 if (!DemandedElts.isAllOnes())
40484 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40485 Src, DemandedElts, TLO.DAG, Depth + 1))
40486 return TLO.CombineTo(
40487 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
40488 break;
40489 }
40490 case X86ISD::VPSHA:
40491 case X86ISD::VPSHL:
40492 case X86ISD::VSHLV:
40493 case X86ISD::VSRLV:
40494 case X86ISD::VSRAV: {
40495 APInt LHSUndef, LHSZero;
40496 APInt RHSUndef, RHSZero;
40497 SDValue LHS = Op.getOperand(0);
40498 SDValue RHS = Op.getOperand(1);
40499 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40500 Depth + 1))
40501 return true;
40502 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40503 Depth + 1))
40504 return true;
40505 KnownZero = LHSZero;
40506 break;
40507 }
40508 case X86ISD::KSHIFTL: {
40509 SDValue Src = Op.getOperand(0);
40510 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40511 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40512 unsigned ShiftAmt = Amt->getZExtValue();
40513
40514 if (ShiftAmt == 0)
40515 return TLO.CombineTo(Op, Src);
40516
40517 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
40518 // single shift. We can do this if the bottom bits (which are shifted
40519 // out) are never demanded.
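// e.g. kshiftl(kshiftr(X, 6), 2) with the low 2 lanes not demanded folds to
// kshiftr(X, 4): Diff = 2 - 6 = -4, so the opcode flips to KSHIFTR with
// amount 4 (illustrative example).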
40520 if (Src.getOpcode() == X86ISD::KSHIFTR) {
40521 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
40522 unsigned C1 = Src.getConstantOperandVal(1);
40523 unsigned NewOpc = X86ISD::KSHIFTL;
40524 int Diff = ShiftAmt - C1;
40525 if (Diff < 0) {
40526 Diff = -Diff;
40527 NewOpc = X86ISD::KSHIFTR;
40528 }
40529
40530 SDLoc dl(Op);
40531 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40532 return TLO.CombineTo(
40533 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40534 }
40535 }
40536
40537 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
40538 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40539 Depth + 1))
40540 return true;
40541
40542 KnownUndef <<= ShiftAmt;
40543 KnownZero <<= ShiftAmt;
40544 KnownZero.setLowBits(ShiftAmt);
40545 break;
40546 }
40547 case X86ISD::KSHIFTR: {
40548 SDValue Src = Op.getOperand(0);
40549 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40550 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40551 unsigned ShiftAmt = Amt->getZExtValue();
40552
40553 if (ShiftAmt == 0)
40554 return TLO.CombineTo(Op, Src);
40555
40556 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
40557 // single shift. We can do this if the top bits (which are shifted
40558 // out) are never demanded.
40559 if (Src.getOpcode() == X86ISD::KSHIFTL) {
40560 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
40561 unsigned C1 = Src.getConstantOperandVal(1);
40562 unsigned NewOpc = X86ISD::KSHIFTR;
40563 int Diff = ShiftAmt - C1;
40564 if (Diff < 0) {
40565 Diff = -Diff;
40566 NewOpc = X86ISD::KSHIFTL;
40567 }
40568
40569 SDLoc dl(Op);
40570 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40571 return TLO.CombineTo(
40572 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40573 }
40574 }
40575
40576 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
40577 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40578 Depth + 1))
40579 return true;
40580
40581 KnownUndef.lshrInPlace(ShiftAmt);
40582 KnownZero.lshrInPlace(ShiftAmt);
40583 KnownZero.setHighBits(ShiftAmt);
40584 break;
40585 }
40586 case X86ISD::ANDNP: {
40587 // ANDNP = (~LHS & RHS);
40588 SDValue LHS = Op.getOperand(0);
40589 SDValue RHS = Op.getOperand(1);
40590
40591 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
40592 APInt UndefElts;
40593 SmallVector<APInt> EltBits;
40594 int NumElts = VT.getVectorNumElements();
40595 int EltSizeInBits = VT.getScalarSizeInBits();
40596 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
40597 APInt OpElts = DemandedElts;
40598 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40599 EltBits)) {
40600 OpBits.clearAllBits();
40601 OpElts.clearAllBits();
40602 for (int I = 0; I != NumElts; ++I)
40603 if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) ||
40604 (!Invert && !EltBits[I].isZero()))) {
40605 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
40606 OpElts.setBit(I);
40607 }
40608 }
40609 return std::make_pair(OpBits, OpElts);
40610 };
40611 std::pair<APInt, APInt> DemandLHS = GetDemandedMasks(RHS);
40612 std::pair<APInt, APInt> DemandRHS = GetDemandedMasks(LHS, true);
40613
40614 APInt LHSUndef, LHSZero;
40615 APInt RHSUndef, RHSZero;
40616 if (SimplifyDemandedVectorElts(LHS, DemandLHS.second, LHSUndef, LHSZero,
40617 TLO, Depth + 1))
40618 return true;
40619 if (SimplifyDemandedVectorElts(RHS, DemandRHS.second, RHSUndef, RHSZero,
40620 TLO, Depth + 1))
40621 return true;
40622
40623 if (!DemandedElts.isAllOnes()) {
40624 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
40625 LHS, DemandLHS.first, DemandLHS.second, TLO.DAG, Depth + 1);
40626 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
40627 RHS, DemandRHS.first, DemandRHS.second, TLO.DAG, Depth + 1);
40628 if (NewLHS || NewRHS) {
40629 NewLHS = NewLHS ? NewLHS : LHS;
40630 NewRHS = NewRHS ? NewRHS : RHS;
40631 return TLO.CombineTo(
40632 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40633 }
40634 }
40635 break;
40636 }
40637 case X86ISD::CVTSI2P:
40638 case X86ISD::CVTUI2P: {
40639 SDValue Src = Op.getOperand(0);
40640 MVT SrcVT = Src.getSimpleValueType();
40641 APInt SrcUndef, SrcZero;
40642 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40643 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40644 Depth + 1))
40645 return true;
40646 break;
40647 }
40648 case X86ISD::PACKSS:
40649 case X86ISD::PACKUS: {
40650 SDValue N0 = Op.getOperand(0);
40651 SDValue N1 = Op.getOperand(1);
40652
40653 APInt DemandedLHS, DemandedRHS;
40654 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40655
40656 APInt LHSUndef, LHSZero;
40657 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40658 Depth + 1))
40659 return true;
40660 APInt RHSUndef, RHSZero;
40661 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40662 Depth + 1))
40663 return true;
40664
40665 // TODO - pass on known zero/undef.
40666
40667 // Aggressively peek through ops to get at the demanded elts.
40668 // TODO - we should do this for all target/faux shuffles ops.
40669 if (!DemandedElts.isAllOnes()) {
40670 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40671 TLO.DAG, Depth + 1);
40672 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40673 TLO.DAG, Depth + 1);
40674 if (NewN0 || NewN1) {
40675 NewN0 = NewN0 ? NewN0 : N0;
40676 NewN1 = NewN1 ? NewN1 : N1;
40677 return TLO.CombineTo(Op,
40678 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40679 }
40680 }
40681 break;
40682 }
40683 case X86ISD::HADD:
40684 case X86ISD::HSUB:
40685 case X86ISD::FHADD:
40686 case X86ISD::FHSUB: {
40687 SDValue N0 = Op.getOperand(0);
40688 SDValue N1 = Op.getOperand(1);
40689
40690 APInt DemandedLHS, DemandedRHS;
40691 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40692
40693 APInt LHSUndef, LHSZero;
40694 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40695 Depth + 1))
40696 return true;
40697 APInt RHSUndef, RHSZero;
40698 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40699 Depth + 1))
40700 return true;
40701
40702 // TODO - pass on known zero/undef.
40703
40704 // Aggressively peek through ops to get at the demanded elts.
40705 // TODO: Handle repeated operands.
40706 if (N0 != N1 && !DemandedElts.isAllOnes()) {
40707 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40708 TLO.DAG, Depth + 1);
40709 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40710 TLO.DAG, Depth + 1);
40711 if (NewN0 || NewN1) {
40712 NewN0 = NewN0 ? NewN0 : N0;
40713 NewN1 = NewN1 ? NewN1 : N1;
40714 return TLO.CombineTo(Op,
40715 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40716 }
40717 }
40718 break;
40719 }
40720 case X86ISD::VTRUNC:
40721 case X86ISD::VTRUNCS:
40722 case X86ISD::VTRUNCUS: {
40723 SDValue Src = Op.getOperand(0);
40724 MVT SrcVT = Src.getSimpleValueType();
40725 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40726 APInt SrcUndef, SrcZero;
40727 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
40728 Depth + 1))
40729 return true;
40730 KnownZero = SrcZero.zextOrTrunc(NumElts);
40731 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
40732 break;
40733 }
40734 case X86ISD::BLENDV: {
40735 APInt SelUndef, SelZero;
40736 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
40737 SelZero, TLO, Depth + 1))
40738 return true;
40739
40740 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
40741 APInt LHSUndef, LHSZero;
40742 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
40743 LHSZero, TLO, Depth + 1))
40744 return true;
40745
40746 APInt RHSUndef, RHSZero;
40747 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
40748 RHSZero, TLO, Depth + 1))
40749 return true;
40750
40751 KnownZero = LHSZero & RHSZero;
40752 KnownUndef = LHSUndef & RHSUndef;
40753 break;
40754 }
40755 case X86ISD::VZEXT_MOVL: {
40756 // If upper demanded elements are already zero then we have nothing to do.
40757 SDValue Src = Op.getOperand(0);
40758 APInt DemandedUpperElts = DemandedElts;
40759 DemandedUpperElts.clearLowBits(1);
40760 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
40761 return TLO.CombineTo(Op, Src);
40762 break;
40763 }
40764 case X86ISD::VBROADCAST: {
40765 SDValue Src = Op.getOperand(0);
40766 MVT SrcVT = Src.getSimpleValueType();
40767 if (!SrcVT.isVector())
40768 break;
40769 // Don't bother broadcasting if we just need the 0'th element.
40770 if (DemandedElts == 1) {
40771 if (Src.getValueType() != VT)
40772 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
40773 SDLoc(Op));
40774 return TLO.CombineTo(Op, Src);
40775 }
40776 APInt SrcUndef, SrcZero;
40777 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
40778 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40779 Depth + 1))
40780 return true;
40781 // Aggressively peek through src to get at the demanded elt.
40782 // TODO - we should do this for all target/faux shuffle ops.
40783 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40784 Src, SrcElts, TLO.DAG, Depth + 1))
40785 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
40786 break;
40787 }
40788 case X86ISD::VPERMV:
40789 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
40790 Depth))
40791 return true;
40792 break;
40793 case X86ISD::PSHUFB:
40794 case X86ISD::VPERMV3:
40795 case X86ISD::VPERMILPV:
40796 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
40797 Depth))
40798 return true;
40799 break;
40800 case X86ISD::VPPERM:
40801 case X86ISD::VPERMIL2:
40802 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
40803 Depth))
40804 return true;
40805 break;
40806 }
40807
40808 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
40809 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
40810 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
40811 if ((VT.is256BitVector() || VT.is512BitVector()) &&
40812 DemandedElts.lshr(NumElts / 2) == 0) {
40813 unsigned SizeInBits = VT.getSizeInBits();
40814 unsigned ExtSizeInBits = SizeInBits / 2;
40815
40816 // See if 512-bit ops only use the bottom 128-bits.
40817 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
40818 ExtSizeInBits = SizeInBits / 4;
40819
40820 switch (Opc) {
40821 // Scalar broadcast.
40822 case X86ISD::VBROADCAST: {
40823 SDLoc DL(Op);
40824 SDValue Src = Op.getOperand(0);
40825 if (Src.getValueSizeInBits() > ExtSizeInBits)
40826 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
40827 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40828 ExtSizeInBits / VT.getScalarSizeInBits());
40829 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
40830 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40831 TLO.DAG, DL, ExtSizeInBits));
40832 }
40833 case X86ISD::VBROADCAST_LOAD: {
40834 SDLoc DL(Op);
40835 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40836 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40837 ExtSizeInBits / VT.getScalarSizeInBits());
40838 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
40839 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
40840 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
40841 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
40842 MemIntr->getMemOperand());
40843 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40844 Bcst.getValue(1));
40845 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40846 TLO.DAG, DL, ExtSizeInBits));
40847 }
40848 // Subvector broadcast.
40849 case X86ISD::SUBV_BROADCAST_LOAD: {
40850 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40851 EVT MemVT = MemIntr->getMemoryVT();
40852 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
40853 SDLoc DL(Op);
40854 SDValue Ld =
40855 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
40856 MemIntr->getBasePtr(), MemIntr->getMemOperand());
40857 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40858 Ld.getValue(1));
40859 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
40860 TLO.DAG, DL, ExtSizeInBits));
40861 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
40862 SDLoc DL(Op);
40863 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40864 ExtSizeInBits / VT.getScalarSizeInBits());
40865 if (SDValue BcstLd =
40866 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
40867 return TLO.CombineTo(Op,
40868 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
40869 TLO.DAG, DL, ExtSizeInBits));
40870 }
40871 break;
40872 }
40873 // Byte shifts by immediate.
40874 case X86ISD::VSHLDQ:
40875 case X86ISD::VSRLDQ:
40876 // Shift by uniform.
40877 case X86ISD::VSHL:
40878 case X86ISD::VSRL:
40879 case X86ISD::VSRA:
40880 // Shift by immediate.
40881 case X86ISD::VSHLI:
40882 case X86ISD::VSRLI:
40883 case X86ISD::VSRAI: {
40884 SDLoc DL(Op);
40885 SDValue Ext0 =
40886 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
40887 SDValue ExtOp =
40888 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
40889 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40890 SDValue Insert =
40891 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40892 return TLO.CombineTo(Op, Insert);
40893 }
40894 case X86ISD::VPERMI: {
40895 // Simplify PERMPD/PERMQ to extract_subvector.
40896 // TODO: This should be done in shuffle combining.
40897 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
40898 SmallVector<int, 4> Mask;
40899 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
40900 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
40901 SDLoc DL(Op);
40902 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
40903 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40904 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
40905 return TLO.CombineTo(Op, Insert);
40906 }
40907 }
40908 break;
40909 }
40910 case X86ISD::VPERM2X128: {
40911 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
40912 SDLoc DL(Op);
40913 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
40914 if (LoMask & 0x8)
40915 return TLO.CombineTo(
40916 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
40917 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
40918 unsigned SrcIdx = (LoMask & 0x2) >> 1;
40919 SDValue ExtOp =
40920 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
40921 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40922 SDValue Insert =
40923 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40924 return TLO.CombineTo(Op, Insert);
40925 }
40926 // Zero upper elements.
40927 case X86ISD::VZEXT_MOVL:
40928 // Target unary shuffles by immediate:
40929 case X86ISD::PSHUFD:
40930 case X86ISD::PSHUFLW:
40931 case X86ISD::PSHUFHW:
40932 case X86ISD::VPERMILPI:
40933 // (Non-Lane Crossing) Target Shuffles.
40934 case X86ISD::VPERMILPV:
40935 case X86ISD::VPERMIL2:
40936 case X86ISD::PSHUFB:
40937 case X86ISD::UNPCKL:
40938 case X86ISD::UNPCKH:
40939 case X86ISD::BLENDI:
40940 // Integer ops.
40941 case X86ISD::PACKSS:
40942 case X86ISD::PACKUS:
40943 // Horizontal Ops.
40944 case X86ISD::HADD:
40945 case X86ISD::HSUB:
40946 case X86ISD::FHADD:
40947 case X86ISD::FHSUB: {
40948 SDLoc DL(Op);
40949 SmallVector<SDValue, 4> Ops;
40950 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
40951 SDValue SrcOp = Op.getOperand(i);
40952 EVT SrcVT = SrcOp.getValueType();
40953 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
40954        "Unsupported vector size");
40955 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
40956 ExtSizeInBits)
40957 : SrcOp);
40958 }
40959 MVT ExtVT = VT.getSimpleVT();
40960 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
40961 ExtSizeInBits / ExtVT.getScalarSizeInBits());
40962 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
40963 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40964 SDValue Insert =
40965 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40966 return TLO.CombineTo(Op, Insert);
40967 }
40968 }
40969 }
40970
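The narrowing block above only fires when nothing above the low half (or low quarter, for 512-bit ops) is demanded. As a minimal standalone C++ sketch of that test, treating the demanded-elements set as a plain bitmask (onlyLowHalfDemanded is a made-up name, not an LLVM API):

#include <cassert>
#include <cstdint>

// Treat the demanded-elements set as a bitmask: shifting out the low half
// must leave nothing behind for the narrowing to be legal.
static bool onlyLowHalfDemanded(uint32_t DemandedElts, unsigned NumElts) {
  return (DemandedElts >> (NumElts / 2)) == 0;
}

int main() {
  // 8 x i32 in a ymm register: demanding elements 0 and 3 allows narrowing
  // the op to the low xmm half; demanding element 5 does not.
  assert(onlyLowHalfDemanded(0b00001001, 8));
  assert(!onlyLowHalfDemanded(0b00100000, 8));
  return 0;
}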
40971 // For broadcasts, unless we *only* demand the 0'th element,
40972 // stop attempts at simplification here; we aren't going to improve things,
40973 // and this is better than any potential shuffle.
40974 if (isTargetShuffleSplat(Op) && !DemandedElts.isOne())
40975 return false;
40976
40977 // Get target/faux shuffle mask.
40978 APInt OpUndef, OpZero;
40979 SmallVector<int, 64> OpMask;
40980 SmallVector<SDValue, 2> OpInputs;
40981 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
40982 OpZero, TLO.DAG, Depth, false))
40983 return false;
40984
40985 // Shuffle inputs must be the same size as the result.
40986 if (OpMask.size() != (unsigned)NumElts ||
40987 llvm::any_of(OpInputs, [VT](SDValue V) {
40988 return VT.getSizeInBits() != V.getValueSizeInBits() ||
40989 !V.getValueType().isVector();
40990 }))
40991 return false;
40992
40993 KnownZero = OpZero;
40994 KnownUndef = OpUndef;
40995
40996 // Check if shuffle mask can be simplified to undef/zero/identity.
40997 int NumSrcs = OpInputs.size();
40998 for (int i = 0; i != NumElts; ++i)
40999 if (!DemandedElts[i])
41000 OpMask[i] = SM_SentinelUndef;
41001
41002 if (isUndefInRange(OpMask, 0, NumElts)) {
41003 KnownUndef.setAllBits();
41004 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
41005 }
41006 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
41007 KnownZero.setAllBits();
41008 return TLO.CombineTo(
41009 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41010 }
41011 for (int Src = 0; Src != NumSrcs; ++Src)
41012 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
41013 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
41014
41015 // Attempt to simplify inputs.
41016 for (int Src = 0; Src != NumSrcs; ++Src) {
41017 // TODO: Support inputs of different types.
41018 if (OpInputs[Src].getValueType() != VT)
41019 continue;
41020
41021 int Lo = Src * NumElts;
41022 APInt SrcElts = APInt::getZero(NumElts);
41023 for (int i = 0; i != NumElts; ++i)
41024 if (DemandedElts[i]) {
41025 int M = OpMask[i] - Lo;
41026 if (0 <= M && M < NumElts)
41027 SrcElts.setBit(M);
41028 }
41029
41030 // TODO - Propagate input undef/zero elts.
41031 APInt SrcUndef, SrcZero;
41032 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
41033 TLO, Depth + 1))
41034 return true;
41035 }
41036
41037 // If we don't demand all elements, then attempt to combine to a simpler
41038 // shuffle.
41039 // We need to convert the depth to something combineX86ShufflesRecursively
41040 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
41041 // to match. This prevents combineX86ShuffleChain from returning a
41042 // combined shuffle that's the same as the original root, causing an
41043 // infinite loop.
41044 if (!DemandedElts.isAllOnes()) {
41045 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
41046
41047 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
41048 for (int i = 0; i != NumElts; ++i)
41049 if (DemandedElts[i])
41050 DemandedMask[i] = i;
41051
41052 SDValue NewShuffle = combineX86ShufflesRecursively(
41053 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
41054 /*HasVarMask*/ false,
41055 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
41056 Subtarget);
41057 if (NewShuffle)
41058 return TLO.CombineTo(Op, NewShuffle);
41059 }
41060
41061 return false;
41062}
41063
41064bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
41065 SDValue Op, const APInt &OriginalDemandedBits,
41066 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
41067 unsigned Depth) const {
41068 EVT VT = Op.getValueType();
41069 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
41070 unsigned Opc = Op.getOpcode();
41071 switch(Opc) {
41072 case X86ISD::VTRUNC: {
41073 KnownBits KnownOp;
41074 SDValue Src = Op.getOperand(0);
41075 MVT SrcVT = Src.getSimpleValueType();
41076
41077 // Simplify the input, using demanded bit information.
41078 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
41079 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
41080 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
41081 return true;
41082 break;
41083 }
41084 case X86ISD::PMULDQ:
41085 case X86ISD::PMULUDQ: {
41086 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
41087 KnownBits KnownOp;
41088 SDValue LHS = Op.getOperand(0);
41089 SDValue RHS = Op.getOperand(1);
41090 // FIXME: Can we bound this better?
41091 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
41092 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
41093 TLO, Depth + 1))
41094 return true;
41095 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
41096 TLO, Depth + 1))
41097 return true;
41098
41099 // Aggressively peek through ops to get at the demanded low bits.
41100 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
41101 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41102 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
41103 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41104 if (DemandedLHS || DemandedRHS) {
41105 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
41106 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
41107 return TLO.CombineTo(
41108 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
41109 }
41110 break;
41111 }
41112 case X86ISD::VSHLI: {
41113 SDValue Op0 = Op.getOperand(0);
41114
41115 unsigned ShAmt = Op.getConstantOperandVal(1);
41116 if (ShAmt >= BitWidth)
41117 break;
41118
41119 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
41120
41121 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41122 // single shift. We can do this if the bottom bits (which are shifted
41123 // out) are never demanded.
41124 if (Op0.getOpcode() == X86ISD::VSRLI &&
41125 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
41126 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
41127 if (Shift2Amt < BitWidth) {
41128 int Diff = ShAmt - Shift2Amt;
41129 if (Diff == 0)
41130 return TLO.CombineTo(Op, Op0.getOperand(0));
41131
41132 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
41133 SDValue NewShift = TLO.DAG.getNode(
41134 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
41135 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
41136 return TLO.CombineTo(Op, NewShift);
41137 }
41138 }
41139
41140 // If we are only demanding sign bits then we can use the shift source directly.
41141 unsigned NumSignBits =
41142 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
41143 unsigned UpperDemandedBits =
41144 BitWidth - OriginalDemandedBits.countTrailingZeros();
41145 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41146 return TLO.CombineTo(Op, Op0);
41147
41148 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41149 TLO, Depth + 1))
41150 return true;
41151
41152 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41153 Known.Zero <<= ShAmt;
41154 Known.One <<= ShAmt;
41155
41156 // Low bits known zero.
41157 Known.Zero.setLowBits(ShAmt);
41158 return false;
41159 }
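For the VSHLI-of-VSRLI fold above, the scalar rule it relies on is: when the low ShAmt bits are never demanded, ((X >>u C1) << C2) can be replaced by a single shift by |C2 - C1|. A self-contained C++ check of that rule on one 16-bit lane, as an illustrative sketch only (foldShiftPair is a made-up name):

#include <cassert>
#include <cstdint>

// When the low C2 bits of the result are not demanded, a logical right shift
// by C1 followed by a left shift by C2 collapses to one shift by |C2 - C1|.
static uint16_t foldShiftPair(uint16_t X, unsigned C1, unsigned C2) {
  int Diff = (int)C2 - (int)C1;
  if (Diff == 0)
    return X;                                    // the shifts cancel
  return Diff < 0 ? (uint16_t)(X >> -Diff) : (uint16_t)(X << Diff);
}

int main() {
  uint16_t X = 0xABCD;
  unsigned C1 = 3, C2 = 5;
  uint16_t DemandedBits = 0xFFE0;                // low C2 bits not demanded
  uint16_t TwoShifts = (uint16_t)((uint16_t)(X >> C1) << C2);
  assert((TwoShifts & DemandedBits) ==
         (foldShiftPair(X, C1, C2) & DemandedBits));
  return 0;
}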
41160 case X86ISD::VSRLI: {
41161 unsigned ShAmt = Op.getConstantOperandVal(1);
41162 if (ShAmt >= BitWidth)
41163 break;
41164
41165 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41166
41167 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
41168 OriginalDemandedElts, Known, TLO, Depth + 1))
41169 return true;
41170
41171 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41172 Known.Zero.lshrInPlace(ShAmt);
41173 Known.One.lshrInPlace(ShAmt);
41174
41175 // High bits known zero.
41176 Known.Zero.setHighBits(ShAmt);
41177 return false;
41178 }
41179 case X86ISD::VSRAI: {
41180 SDValue Op0 = Op.getOperand(0);
41181 SDValue Op1 = Op.getOperand(1);
41182
41183 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
41184 if (ShAmt >= BitWidth)
41185 break;
41186
41187 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41188
41189 // If we just want the sign bit then we don't need to shift it.
41190 if (OriginalDemandedBits.isSignMask())
41191 return TLO.CombineTo(Op, Op0);
41192
41193 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
41194 if (Op0.getOpcode() == X86ISD::VSHLI &&
41195 Op.getOperand(1) == Op0.getOperand(1)) {
41196 SDValue Op00 = Op0.getOperand(0);
41197 unsigned NumSignBits =
41198 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
41199 if (ShAmt < NumSignBits)
41200 return TLO.CombineTo(Op, Op00);
41201 }
41202
41203 // If any of the demanded bits are produced by the sign extension, we also
41204 // demand the input sign bit.
41205 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
41206 DemandedMask.setSignBit();
41207
41208 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41209 TLO, Depth + 1))
41210 return true;
41211
41212 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41213 Known.Zero.lshrInPlace(ShAmt);
41214 Known.One.lshrInPlace(ShAmt);
41215
41216 // If the input sign bit is known to be zero, or if none of the top bits
41217 // are demanded, turn this into an unsigned shift right.
41218 if (Known.Zero[BitWidth - ShAmt - 1] ||
41219 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
41220 return TLO.CombineTo(
41221 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
41222
41223 // High bits are known one.
41224 if (Known.One[BitWidth - ShAmt - 1])
41225 Known.One.setHighBits(ShAmt);
41226 return false;
41227 }
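The (VSRAI (VSHLI X, C1), C1) --> X fold above depends on X having more than C1 sign bits, so the left shift cannot push a different value into the sign position. A standalone C++ sanity check of that property on a single 16-bit lane (numSignBits16 is a hypothetical helper, not the DAG's ComputeNumSignBits):

#include <cassert>
#include <cstdint>

// Count how many of the top bits of a 16-bit value are copies of its sign bit
// (the sign bit itself counts, so the result is at least 1).
static unsigned numSignBits16(int16_t V) {
  int Sign = (V >> 15) & 1;
  unsigned N = 1;
  for (int Bit = 14; Bit >= 0 && ((V >> Bit) & 1) == Sign; --Bit)
    ++N;
  return N;
}

int main() {
  int16_t X = 5;                       // plenty of sign-bit copies up top
  unsigned C1 = 4;
  assert(numSignBits16(X) > C1);
  int16_t RoundTrip = (int16_t)((int16_t)(X << C1) >> C1);
  assert(RoundTrip == X);              // so (sra (shl X, C1), C1) == X
  return 0;
}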
41228 case X86ISD::BLENDV: {
41229 SDValue Sel = Op.getOperand(0);
41230 SDValue LHS = Op.getOperand(1);
41231 SDValue RHS = Op.getOperand(2);
41232
41233 APInt SignMask = APInt::getSignMask(BitWidth);
41234 SDValue NewSel = SimplifyMultipleUseDemandedBits(
41235 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41236 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
41237 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41238 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
41239 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41240
41241 if (NewSel || NewLHS || NewRHS) {
41242 NewSel = NewSel ? NewSel : Sel;
41243 NewLHS = NewLHS ? NewLHS : LHS;
41244 NewRHS = NewRHS ? NewRHS : RHS;
41245 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
41246 NewSel, NewLHS, NewRHS));
41247 }
41248 break;
41249 }
41250 case X86ISD::PEXTRB:
41251 case X86ISD::PEXTRW: {
41252 SDValue Vec = Op.getOperand(0);
41253 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
41254 MVT VecVT = Vec.getSimpleValueType();
41255 unsigned NumVecElts = VecVT.getVectorNumElements();
41256
41257 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
41258 unsigned Idx = CIdx->getZExtValue();
41259 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
41260
41261 // If we demand no bits from the vector then we must have demanded
41262 // bits from the implicit zext - simplify to zero.
41263 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
41264 if (DemandedVecBits == 0)
41265 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41266
41267 APInt KnownUndef, KnownZero;
41268 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
41269 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
41270 KnownZero, TLO, Depth + 1))
41271 return true;
41272
41273 KnownBits KnownVec;
41274 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
41275 KnownVec, TLO, Depth + 1))
41276 return true;
41277
41278 if (SDValue V = SimplifyMultipleUseDemandedBits(
41279 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
41280 return TLO.CombineTo(
41281 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
41282
41283 Known = KnownVec.zext(BitWidth);
41284 return false;
41285 }
41286 break;
41287 }
41288 case X86ISD::PINSRB:
41289 case X86ISD::PINSRW: {
41290 SDValue Vec = Op.getOperand(0);
41291 SDValue Scl = Op.getOperand(1);
41292 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41293 MVT VecVT = Vec.getSimpleValueType();
41294
41295 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
41296 unsigned Idx = CIdx->getZExtValue();
41297 if (!OriginalDemandedElts[Idx])
41298 return TLO.CombineTo(Op, Vec);
41299
41300 KnownBits KnownVec;
41301 APInt DemandedVecElts(OriginalDemandedElts);
41302 DemandedVecElts.clearBit(Idx);
41303 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
41304 KnownVec, TLO, Depth + 1))
41305 return true;
41306
41307 KnownBits KnownScl;
41308 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
41309 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
41310 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
41311 return true;
41312
41313 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
41314 Known = KnownBits::commonBits(KnownVec, KnownScl);
41315 return false;
41316 }
41317 break;
41318 }
41319 case X86ISD::PACKSS:
41320 // PACKSS saturates to MIN/MAX integer values. So if we just want the
41321 // sign bit then we can just ask for the source operands' sign bit.
41322 // TODO - add known bits handling.
41323 if (OriginalDemandedBits.isSignMask()) {
41324 APInt DemandedLHS, DemandedRHS;
41325 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
41326
41327 KnownBits KnownLHS, KnownRHS;
41328 APInt SignMask = APInt::getSignMask(BitWidth * 2);
41329 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
41330 KnownLHS, TLO, Depth + 1))
41331 return true;
41332 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
41333 KnownRHS, TLO, Depth + 1))
41334 return true;
41335
41336 // Attempt to avoid multi-use ops if we don't need anything from them.
41337 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
41338 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
41339 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
41340 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
41341 if (DemandedOp0 || DemandedOp1) {
41342 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
41343 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
41344 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
41345 }
41346 }
41347 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
41348 break;
41349 case X86ISD::VBROADCAST: {
41350 SDValue Src = Op.getOperand(0);
41351 MVT SrcVT = Src.getSimpleValueType();
41352 APInt DemandedElts = APInt::getOneBitSet(
41353 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
41354 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
41355 TLO, Depth + 1))
41356 return true;
41357 // If we don't need the upper bits, attempt to narrow the broadcast source.
41358 // Don't attempt this on AVX512 as it might affect broadcast folding.
41359 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
41360 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
41361 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
41362 Src->hasOneUse()) {
41363 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
41364 SDValue NewSrc =
41365 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
41366 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
41367 SDValue NewBcst =
41368 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
41369 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
41370 }
41371 break;
41372 }
41373 case X86ISD::PCMPGT:
41374 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41375 // iff we only need the sign bit then we can use R directly.
41376 if (OriginalDemandedBits.isSignMask() &&
41377 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41378 return TLO.CombineTo(Op, Op.getOperand(1));
41379 break;
41380 case X86ISD::MOVMSK: {
41381 SDValue Src = Op.getOperand(0);
41382 MVT SrcVT = Src.getSimpleValueType();
41383 unsigned SrcBits = SrcVT.getScalarSizeInBits();
41384 unsigned NumElts = SrcVT.getVectorNumElements();
41385
41386 // If we don't need the sign bits at all just return zero.
41387 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
41388 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41389
41390 // See if we only demand bits from the lower 128-bit vector.
41391 if (SrcVT.is256BitVector() &&
41392 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
41393 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
41394 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41395 }
41396
41397 // Only demand the vector elements of the sign bits we need.
41398 APInt KnownUndef, KnownZero;
41399 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
41400 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
41401 TLO, Depth + 1))
41402 return true;
41403
41404 Known.Zero = KnownZero.zextOrSelf(BitWidth);
41405 Known.Zero.setHighBits(BitWidth - NumElts);
41406
41407 // MOVMSK only uses the MSB from each vector element.
41408 KnownBits KnownSrc;
41409 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
41410 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
41411 Depth + 1))
41412 return true;
41413
41414 if (KnownSrc.One[SrcBits - 1])
41415 Known.One.setLowBits(NumElts);
41416 else if (KnownSrc.Zero[SrcBits - 1])
41417 Known.Zero.setLowBits(NumElts);
41418
41419 // Attempt to avoid a multi-use op if we don't need anything from it.
41420 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
41421 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
41422 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41423 return false;
41424 }
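MOVMSK packs the sign bit of each source element into the low bits of a scalar, which is why the case above only demands the per-element MSBs. A rough scalar model for the 4 x 32-bit flavour (movmsk4x32 is an illustrative name, not an LLVM helper):

#include <cassert>
#include <cstdint>

// Result bit i is the sign bit of lane i, so only the MSB of each element is
// ever demanded from the source vector.
static uint32_t movmsk4x32(const uint32_t Lanes[4]) {
  uint32_t Mask = 0;
  for (unsigned I = 0; I != 4; ++I)
    Mask |= (Lanes[I] >> 31) << I;
  return Mask;
}

int main() {
  const uint32_t Lanes[4] = {0x80000000u, 0x7FFFFFFFu, 0xFFFFFFFFu, 1u};
  assert(movmsk4x32(Lanes) == 0b0101);
  return 0;
}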
41425 case X86ISD::BEXTR:
41426 case X86ISD::BEXTRI: {
41427 SDValue Op0 = Op.getOperand(0);
41428 SDValue Op1 = Op.getOperand(1);
41429
41430 // Only bottom 16-bits of the control bits are required.
41431 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
41432 // NOTE: SimplifyDemandedBits won't do this for constants.
41433 uint64_t Val1 = Cst1->getZExtValue();
41434 uint64_t MaskedVal1 = Val1 & 0xFFFF;
41435 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
41436 SDLoc DL(Op);
41437 return TLO.CombineTo(
41438 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
41439 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
41440 }
41441
41442 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
41443 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
41444
41445 // If the length is 0, the result is 0.
41446 if (Length == 0) {
41447 Known.setAllZero();
41448 return false;
41449 }
41450
41451 if ((Shift + Length) <= BitWidth) {
41452 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
41453 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
41454 return true;
41455
41456 Known = Known.extractBits(Length, Shift);
41457 Known = Known.zextOrTrunc(BitWidth);
41458 return false;
41459 }
41460 } else {
41461 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
41462 KnownBits Known1;
41463 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
41464 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
41465 return true;
41466
41467 // If the length is 0, replace with 0.
41468 KnownBits LengthBits = Known1.extractBits(8, 8);
41469 if (LengthBits.isZero())
41470 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41471 }
41472
41473 break;
41474 }
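As context for the BEXTR handling above: the control operand encodes the start bit in bits [7:0] and the field length in bits [15:8], so only its low 16 bits matter. A simplified standalone C++ model of that semantic (bextr32 is a made-up name, and real-instruction edge cases are only approximated):

#include <cassert>
#include <cstdint>

// Bits [7:0] of the control are the start bit, bits [15:8] the field length;
// anything above bit 15 is ignored, matching the masking done above.
static uint32_t bextr32(uint32_t Src, uint32_t Control) {
  unsigned Shift = Control & 0xFF;
  unsigned Length = (Control >> 8) & 0xFF;
  if (Length == 0 || Shift >= 32)
    return 0;
  uint64_t FieldMask = (Length >= 32) ? ~0ULL : ((1ULL << Length) - 1);
  return (uint32_t)((Src >> Shift) & FieldMask);
}

int main() {
  unsigned Control = 4u | (8u << 8);             // start = 4, length = 8
  assert(bextr32(0xDEADBEEFu, Control) == 0xEE);
  // Only the low 16 control bits matter.
  assert(bextr32(0xDEADBEEFu, Control | 0xFFFF0000u) == 0xEE);
  return 0;
}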
41475 case X86ISD::PDEP: {
41476 SDValue Op0 = Op.getOperand(0);
41477 SDValue Op1 = Op.getOperand(1);
41478
41479 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
41480 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
41481
41482 // If the demanded bits have leading zeroes, we don't demand those from the
41483 // mask.
41484 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
41485 return true;
41486
41487 // The number of possible 1s in the mask determines the number of LSBs of
41488 // operand 0 used. Undemanded bits from the mask don't matter so filter
41489 // them before counting.
41490 KnownBits Known2;
41491 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
41492 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
41493 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
41494 return true;
41495
41496 // Zeroes are retained from the mask, but not ones.
41497 Known.One.clearAllBits();
41498 // The result will have at least as many trailing zeros as the non-mask
41499 // operand since bits can only map to the same or higher bit position.
41500 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
41501 return false;
41502 }
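PDEP scatters the low bits of its first operand into the set-bit positions of the mask, which is what bounds the demanded LSBs above by the mask's popcount. A small software model as a sketch (pdep64 is an illustrative name):

#include <cassert>
#include <cstdint>

// Deposit the low bits of Src into the set-bit positions of Mask, from the
// least significant mask bit upwards.
static uint64_t pdep64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0;
  while (Mask != 0) {
    uint64_t LowestMaskBit = Mask & (0 - Mask);  // isolate lowest set bit
    if (Src & 1)
      Result |= LowestMaskBit;
    Src >>= 1;
    Mask &= Mask - 1;                            // clear that mask bit
  }
  return Result;
}

int main() {
  uint64_t Mask = 0b10110000;                    // three set bits
  assert(pdep64(0b101, Mask) == 0b10010000);
  assert(pdep64(0b101 | 0xFF00, Mask) == 0b10010000);  // higher Src bits unused
  return 0;
}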
41503 }
41504
41505 return TargetLowering::SimplifyDemandedBitsForTargetNode(
41506 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
41507}
41508
41509SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41510 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
41511 SelectionDAG &DAG, unsigned Depth) const {
41512 int NumElts = DemandedElts.getBitWidth();
41513 unsigned Opc = Op.getOpcode();
41514 EVT VT = Op.getValueType();
41515
41516 switch (Opc) {
41517 case X86ISD::PINSRB:
41518 case X86ISD::PINSRW: {
41519 // If we don't demand the inserted element, return the base vector.
41520 SDValue Vec = Op.getOperand(0);
41521 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41522 MVT VecVT = Vec.getSimpleValueType();
41523 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
41524 !DemandedElts[CIdx->getZExtValue()])
41525 return Vec;
41526 break;
41527 }
41528 case X86ISD::VSHLI: {
41529 // If we are only demanding sign bits then we can use the shift source
41530 // directly.
41531 SDValue Op0 = Op.getOperand(0);
41532 unsigned ShAmt = Op.getConstantOperandVal(1);
41533 unsigned BitWidth = DemandedBits.getBitWidth();
41534 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
41535 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
41536 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41537 return Op0;
41538 break;
41539 }
41540 case X86ISD::VSRAI:
41541 // iff we only need the sign bit then we can use the source directly.
41542 // TODO: generalize where we only demand extended signbits.
41543 if (DemandedBits.isSignMask())
41544 return Op.getOperand(0);
41545 break;
41546 case X86ISD::PCMPGT:
41547 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41548 // iff we only need the sign bit then we can use R directly.
41549 if (DemandedBits.isSignMask() &&
41550 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41551 return Op.getOperand(1);
41552 break;
41553 }
41554
41555 APInt ShuffleUndef, ShuffleZero;
41556 SmallVector<int, 16> ShuffleMask;
41557 SmallVector<SDValue, 2> ShuffleOps;
41558 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
41559 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
41560 // If all the demanded elts are from one operand and are inline,
41561 // then we can use the operand directly.
41562 int NumOps = ShuffleOps.size();
41563 if (ShuffleMask.size() == (unsigned)NumElts &&
41564 llvm::all_of(ShuffleOps, [VT](SDValue V) {
41565 return VT.getSizeInBits() == V.getValueSizeInBits();
41566 })) {
41567
41568 if (DemandedElts.isSubsetOf(ShuffleUndef))
41569 return DAG.getUNDEF(VT);
41570 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
41571 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
41572
41573 // Bitmask that indicates which ops have only been accessed 'inline'.
41574 APInt IdentityOp = APInt::getAllOnes(NumOps);
41575 for (int i = 0; i != NumElts; ++i) {
41576 int M = ShuffleMask[i];
41577 if (!DemandedElts[i] || ShuffleUndef[i])
41578 continue;
41579 int OpIdx = M / NumElts;
41580 int EltIdx = M % NumElts;
41581 if (M < 0 || EltIdx != i) {
41582 IdentityOp.clearAllBits();
41583 break;
41584 }
41585 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
41586 if (IdentityOp == 0)
41587 break;
41588 }
41589 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
41590        "Multiple identity shuffles detected");
41591
41592 if (IdentityOp != 0)
41593 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
41594 }
41595 }
41596
41597 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41598 Op, DemandedBits, DemandedElts, DAG, Depth);
41599}
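The identity-operand scan above can be pictured with a simplified standalone model that ignores the undef/zero bookkeeping: every demanded lane must read its own element index from one and the same operand. findIdentityOp below is a made-up helper used only for illustration:

#include <cassert>
#include <cstdint>
#include <vector>

// Every demanded lane I must read element I of one and the same operand,
// i.e. a mask value of OpIdx * NumElts + I. Returns that operand index,
// or -1 if there is no single identity operand.
static int findIdentityOp(const std::vector<int> &Mask, uint32_t DemandedElts,
                          int NumOps) {
  int NumElts = (int)Mask.size();
  int Identity = -1;
  for (int I = 0; I != NumElts; ++I) {
    if (!(DemandedElts & (1u << I)))
      continue;                              // undemanded lanes don't matter
    int M = Mask[I];
    if (M < 0 || M % NumElts != I)
      return -1;                             // not an inline element access
    int OpIdx = M / NumElts;
    if (OpIdx >= NumOps || (Identity != -1 && Identity != OpIdx))
      return -1;                             // mixes operands
    Identity = OpIdx;
  }
  return Identity;
}

int main() {
  // Two 4-element operands; demanded lanes 0 and 2 read inline from operand 1
  // (mask values 4 and 6), so operand 1 can be returned directly.
  assert(findIdentityOp({4, 1, 6, 3}, 0b0101, 2) == 1);
  // Additionally demanding lane 1, which reads operand 0, defeats the fold.
  assert(findIdentityOp({4, 1, 6, 3}, 0b0111, 2) == -1);
  return 0;
}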
41600
41601bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
41602 const APInt &DemandedElts,
41603 APInt &UndefElts,
41604 unsigned Depth) const {
41605 unsigned NumElts = DemandedElts.getBitWidth();
41606 unsigned Opc = Op.getOpcode();
41607
41608 switch (Opc) {
41609 case X86ISD::VBROADCAST:
41610 case X86ISD::VBROADCAST_LOAD:
41611 UndefElts = APInt::getNullValue(NumElts);
41612 return true;
41613 }
41614
41615 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
41616 Depth);
41617}
41618
41619// Helper to peek through bitops/trunc/setcc to determine size of source vector.
41620// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
41621static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
41622 bool AllowTruncate) {
41623 switch (Src.getOpcode()) {
41624 case ISD::TRUNCATE:
41625 if (!AllowTruncate)
41626 return false;
41627 LLVM_FALLTHROUGH;
41628 case ISD::SETCC:
41629 return Src.getOperand(0).getValueSizeInBits() == Size;
41630 case ISD::AND:
41631 case ISD::XOR:
41632 case ISD::OR:
41633 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
41634 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
41635 }
41636 return false;
41637}
41638
41639// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
41640static unsigned getAltBitOpcode(unsigned Opcode) {
41641 switch(Opcode) {
41642 case ISD::AND: return X86ISD::FAND;
41643 case ISD::OR: return X86ISD::FOR;
41644 case ISD::XOR: return X86ISD::FXOR;
41645 case X86ISD::ANDNP: return X86ISD::FANDN;
41646 }
41647 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 41647)
;
41648}
41649
41650// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
41651static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
41652 const SDLoc &DL) {
41653 EVT SrcVT = Src.getValueType();
41654 if (SrcVT != MVT::v4i1)
41655 return SDValue();
41656
41657 switch (Src.getOpcode()) {
41658 case ISD::SETCC:
41659 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
41660 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
41661 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
41662 SDValue Op0 = Src.getOperand(0);
41663 if (ISD::isNormalLoad(Op0.getNode()))
41664 return DAG.getBitcast(MVT::v4f32, Op0);
41665 if (Op0.getOpcode() == ISD::BITCAST &&
41666 Op0.getOperand(0).getValueType() == MVT::v4f32)
41667 return Op0.getOperand(0);
41668 }
41669 break;
41670 case ISD::AND:
41671 case ISD::XOR:
41672 case ISD::OR: {
41673 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
41674 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
41675 if (Op0 && Op1)
41676 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
41677 Op1);
41678 break;
41679 }
41680 }
41681 return SDValue();
41682}
41683
41684// Helper to push sign extension of vXi1 SETCC result through bitops.
41685static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
41686 SDValue Src, const SDLoc &DL) {
41687 switch (Src.getOpcode()) {
41688 case ISD::SETCC:
41689 case ISD::TRUNCATE:
41690 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41691 case ISD::AND:
41692 case ISD::XOR:
41693 case ISD::OR:
41694 return DAG.getNode(
41695 Src.getOpcode(), DL, SExtVT,
41696 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
41697 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
41698 }
41699 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41699)
;
41700}
41701
41702// Try to match patterns such as
41703// (i16 bitcast (v16i1 x))
41704// ->
41705// (i16 movmsk (16i8 sext (v16i1 x)))
41706// before the illegal vector is scalarized on subtargets that don't have legal
41707// vxi1 types.
41708static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
41709 const SDLoc &DL,
41710 const X86Subtarget &Subtarget) {
41711 EVT SrcVT = Src.getValueType();
41712 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
41713 return SDValue();
41714
41715 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
41716 // legalization destroys the v4i32 type.
41717 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
41718 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
41719 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
41720 DAG.getBitcast(MVT::v4f32, V));
41721 return DAG.getZExtOrTrunc(V, DL, VT);
41722 }
41723 }
41724
41725 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
41726 // movmskb even with avx512. This will be better than truncating to vXi1 and
41727 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
41728 // vpcmpeqb/vpcmpgtb.
41729 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
41730 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
41731 Src.getOperand(0).getValueType() == MVT::v32i8 ||
41732 Src.getOperand(0).getValueType() == MVT::v64i8);
41733
41734 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
41735 // directly with vpmovmskb/vmovmskps/vmovmskpd.
41736 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
41737 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
41738 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
41739 EVT CmpVT = Src.getOperand(0).getValueType();
41740 EVT EltVT = CmpVT.getVectorElementType();
41741 if (CmpVT.getSizeInBits() <= 256 &&
41742 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
41743 PreferMovMsk = true;
41744 }
41745
41746 // With AVX512 vxi1 types are legal and we prefer using k-regs.
41747 // MOVMSK is supported in SSE2 or later.
41748 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
41749 return SDValue();
41750
41751 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
41752 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
41753 // v8i16 and v16i16.
41754 // For these two cases, we can shuffle the upper element bytes to a
41755 // consecutive sequence at the start of the vector and treat the results as
41756 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
41757 // for v16i16 this is not the case, because the shuffle is expensive, so we
41758 // avoid sign-extending to this type entirely.
41759 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
41760 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
41761 MVT SExtVT;
41762 bool PropagateSExt = false;
41763 switch (SrcVT.getSimpleVT().SimpleTy) {
41764 default:
41765 return SDValue();
41766 case MVT::v2i1:
41767 SExtVT = MVT::v2i64;
41768 break;
41769 case MVT::v4i1:
41770 SExtVT = MVT::v4i32;
41771 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
41772 // sign-extend to a 256-bit operation to avoid truncation.
41773 if (Subtarget.hasAVX() &&
41774 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
41775 SExtVT = MVT::v4i64;
41776 PropagateSExt = true;
41777 }
41778 break;
41779 case MVT::v8i1:
41780 SExtVT = MVT::v8i16;
41781 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
41782 // sign-extend to a 256-bit operation to match the compare.
41783 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
41784 // 256-bit because the shuffle is cheaper than sign extending the result of
41785 // the compare.
41786 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
41787 checkBitcastSrcVectorSize(Src, 512, true))) {
41788 SExtVT = MVT::v8i32;
41789 PropagateSExt = true;
41790 }
41791 break;
41792 case MVT::v16i1:
41793 SExtVT = MVT::v16i8;
41794 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
41795 // it is not profitable to sign-extend to 256-bit because this will
41796 // require an extra cross-lane shuffle which is more expensive than
41797 // truncating the result of the compare to 128-bits.
41798 break;
41799 case MVT::v32i1:
41800 SExtVT = MVT::v32i8;
41801 break;
41802 case MVT::v64i1:
41803 // If we have AVX512F but not AVX512BW, and the input is truncated from
41804 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
41805 if (Subtarget.hasAVX512()) {
41806 if (Subtarget.hasBWI())
41807 return SDValue();
41808 SExtVT = MVT::v64i8;
41809 break;
41810 }
41811 // Split if this is a <64 x i8> comparison result.
41812 if (checkBitcastSrcVectorSize(Src, 512, false)) {
41813 SExtVT = MVT::v64i8;
41814 break;
41815 }
41816 return SDValue();
41817 };
41818
41819 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
41820 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41821
41822 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
41823 V = getPMOVMSKB(DL, V, DAG, Subtarget);
41824 } else {
41825 if (SExtVT == MVT::v8i16)
41826 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
41827 DAG.getUNDEF(MVT::v8i16));
41828 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
41829 }
41830
41831 EVT IntVT =
41832 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
41833 V = DAG.getZExtOrTrunc(V, DL, IntVT);
41834 return DAG.getBitcast(VT, V);
41835}
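The transform implemented above relies on the fact that bit i of the bitcast result equals the sign bit of lane i after sign-extending each i1 element, which is exactly what (P)MOVMSK collects. A standalone scalar check of that equivalence for the v16i1 case (plain C++, no LLVM types):

#include <cassert>
#include <cstdint>

int main() {
  bool X[16] = {true,  false, false, true,  true,  true,  false, false,
                false, false, false, false, true,  false, true,  false};

  // Direct bitcast model: element i becomes bit i of the i16.
  uint16_t Bitcast = 0;
  for (unsigned I = 0; I != 16; ++I)
    Bitcast |= (uint16_t)X[I] << I;

  // Sign-extend each i1 to i8 and collect the MSBs, as PMOVMSKB would.
  uint16_t Movmsk = 0;
  for (unsigned I = 0; I != 16; ++I) {
    uint8_t Lane = X[I] ? 0xFF : 0x00;           // sext i1 -> i8
    Movmsk |= (uint16_t)(Lane >> 7) << I;
  }

  assert(Bitcast == Movmsk);
  return 0;
}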
41836
41837// Convert a vXi1 constant build vector to the same width scalar integer.
41838static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
41839 EVT SrcVT = Op.getValueType();
41840 assert(SrcVT.getVectorElementType() == MVT::i1 &&
41841        "Expected a vXi1 vector");
41842 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
41843        "Expected a constant build vector");
41844
41845 APInt Imm(SrcVT.getVectorNumElements(), 0);
41846 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
41847 SDValue In = Op.getOperand(Idx);
41848 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
41849 Imm.setBit(Idx);
41850 }
41851 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
41852 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
41853}
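A minimal standalone sketch of the packing rule used by combinevXi1ConstantToInteger: element i of the constant vXi1 vector becomes bit i of the scalar, with undef elements treated as zero. packBoolVector is a made-up name used only for illustration:

#include <cassert>
#include <cstdint>

// Element i of the constant vXi1 vector becomes bit i of the scalar;
// undef elements are simply left as zero.
static uint16_t packBoolVector(const bool Elts[], unsigned NumElts) {
  uint16_t Imm = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    if (Elts[I])
      Imm |= (uint16_t)((uint16_t)1 << I);
  return Imm;
}

int main() {
  const bool Elts[8] = {true, false, true, true, false, false, false, true};
  assert(packBoolVector(Elts, 8) == 0x8D);
  return 0;
}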
41854
41855static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
41856 TargetLowering::DAGCombinerInfo &DCI,
41857 const X86Subtarget &Subtarget) {
41858 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
41859
41860 if (!DCI.isBeforeLegalizeOps())
41861 return SDValue();
41862
41863 // Only do this if we have k-registers.
41864 if (!Subtarget.hasAVX512())
41865 return SDValue();
41866
41867 EVT DstVT = N->getValueType(0);
41868 SDValue Op = N->getOperand(0);
41869 EVT SrcVT = Op.getValueType();
41870
41871 if (!Op.hasOneUse())
41872 return SDValue();
41873
41874 // Look for logic ops.
41875 if (Op.getOpcode() != ISD::AND &&
41876 Op.getOpcode() != ISD::OR &&
41877 Op.getOpcode() != ISD::XOR)
41878 return SDValue();
41879
41880 // Make sure we have a bitcast between mask registers and a scalar type.
41881 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
41882 DstVT.isScalarInteger()) &&
41883 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
41884 SrcVT.isScalarInteger()))
41885 return SDValue();
41886
41887 SDValue LHS = Op.getOperand(0);
41888 SDValue RHS = Op.getOperand(1);
41889
41890 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
41891 LHS.getOperand(0).getValueType() == DstVT)
41892 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
41893 DAG.getBitcast(DstVT, RHS));
41894
41895 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
41896 RHS.getOperand(0).getValueType() == DstVT)
41897 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41898 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
41899
41900 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
41901 // Most of these have to move a constant from the scalar domain anyway.
41902 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
41903 RHS = combinevXi1ConstantToInteger(RHS, DAG);
41904 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41905 DAG.getBitcast(DstVT, LHS), RHS);
41906 }
41907
41908 return SDValue();
41909}
41910
41911static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
41912 const X86Subtarget &Subtarget) {
41913 SDLoc DL(BV);
41914 unsigned NumElts = BV->getNumOperands();
41915 SDValue Splat = BV->getSplatValue();
41916
41917 // Build MMX element from integer GPR or SSE float values.
41918 auto CreateMMXElement = [&](SDValue V) {
41919 if (V.isUndef())
41920 return DAG.getUNDEF(MVT::x86mmx);
41921 if (V.getValueType().isFloatingPoint()) {
41922 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
41923 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
41924 V = DAG.getBitcast(MVT::v2i64, V);
41925 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
41926 }
41927 V = DAG.getBitcast(MVT::i32, V);
41928 } else {
41929 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
41930 }
41931 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
41932 };
41933
41934 // Convert build vector ops to MMX data in the bottom elements.
41935 SmallVector<SDValue, 8> Ops;
41936
41937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41938
41939 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
41940 if (Splat) {
41941 if (Splat.isUndef())
41942 return DAG.getUNDEF(MVT::x86mmx);
41943
41944 Splat = CreateMMXElement(Splat);
41945
41946 if (Subtarget.hasSSE1()) {
41947 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
41948 if (NumElts == 8)
41949 Splat = DAG.getNode(
41950 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41951 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
41952 TLI.getPointerTy(DAG.getDataLayout())),
41953 Splat, Splat);
41954
41955 // Use PSHUFW to repeat 16-bit elements.
41956 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
41957 return DAG.getNode(
41958 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41959 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
41960 TLI.getPointerTy(DAG.getDataLayout())),
41961 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
41962 }
41963 Ops.append(NumElts, Splat);
41964 } else {
41965 for (unsigned i = 0; i != NumElts; ++i)
41966 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
41967 }
41968
41969 // Use tree of PUNPCKLs to build up general MMX vector.
41970 while (Ops.size() > 1) {
41971 unsigned NumOps = Ops.size();
41972 unsigned IntrinOp =
41973 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
41974 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
41975 : Intrinsic::x86_mmx_punpcklbw));
41976 SDValue Intrin = DAG.getTargetConstant(
41977 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
41978 for (unsigned i = 0; i != NumOps; i += 2)
41979 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
41980 Ops[i], Ops[i + 1]);
41981 Ops.resize(NumOps / 2);
41982 }
41983
41984 return Ops[0];
41985}
41986
41987// Recursive function that attempts to find if a bool vector node was originally
41988// a vector/float/double that got truncated/extended/bitcast to/from a scalar
41989// integer. If so, replace the scalar ops with bool vector equivalents back down
41990// the chain.
41991static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
41992 SelectionDAG &DAG,
41993 const X86Subtarget &Subtarget) {
41994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41995 unsigned Opc = V.getOpcode();
41996 switch (Opc) {
41997 case ISD::BITCAST: {
41998 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
41999 SDValue Src = V.getOperand(0);
42000 EVT SrcVT = Src.getValueType();
42001 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
42002 return DAG.getBitcast(VT, Src);
42003 break;
42004 }
42005 case ISD::TRUNCATE: {
42006 // If we find a suitable source, a truncated scalar becomes a subvector.
42007 SDValue Src = V.getOperand(0);
42008 EVT NewSrcVT =
42009 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
42010 if (TLI.isTypeLegal(NewSrcVT))
42011 if (SDValue N0 =
42012 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42013 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
42014 DAG.getIntPtrConstant(0, DL));
42015 break;
42016 }
42017 case ISD::ANY_EXTEND:
42018 case ISD::ZERO_EXTEND: {
42019 // If we find a suitable source, an extended scalar becomes a subvector.
42020 SDValue Src = V.getOperand(0);
42021 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
42022 Src.getScalarValueSizeInBits());
42023 if (TLI.isTypeLegal(NewSrcVT))
42024 if (SDValue N0 =
42025 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42026 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42027 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
42028 : DAG.getConstant(0, DL, VT),
42029 N0, DAG.getIntPtrConstant(0, DL));
42030 break;
42031 }
42032 case ISD::OR: {
42033 // If we find suitable sources, we can just move an OR to the vector domain.
42034 SDValue Src0 = V.getOperand(0);
42035 SDValue Src1 = V.getOperand(1);
42036 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42037 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
42038 return DAG.getNode(Opc, DL, VT, N0, N1);
42039 break;
42040 }
42041 case ISD::SHL: {
42042 // If we find a suitable source, a SHL becomes a KSHIFTL.
42043 SDValue Src0 = V.getOperand(0);
42044 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
42045 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
42046 break;
42047
42048 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
42049 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42050 return DAG.getNode(
42051 X86ISD::KSHIFTL, DL, VT, N0,
42052 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
42053 break;
42054 }
42055 }
42056 return SDValue();
42057}
42058
42059static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
42060 TargetLowering::DAGCombinerInfo &DCI,
42061 const X86Subtarget &Subtarget) {
42062 SDValue N0 = N->getOperand(0);
42063 EVT VT = N->getValueType(0);
42064 EVT SrcVT = N0.getValueType();
42065 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42066
42067 // Try to match patterns such as
42068 // (i16 bitcast (v16i1 x))
42069 // ->
42070 // (i16 movmsk (16i8 sext (v16i1 x)))
42071 // before the setcc result is scalarized on subtargets that don't have legal
42072 // vxi1 types.
42073 if (DCI.isBeforeLegalize()) {
42074 SDLoc dl(N);
42075 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
42076 return V;
42077
42078 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42079 // type, widen both sides to avoid a trip through memory.
42080 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
42081 Subtarget.hasAVX512()) {
42082 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
42083 N0 = DAG.getBitcast(MVT::v8i1, N0);
42084 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
42085 DAG.getIntPtrConstant(0, dl));
42086 }
42087
42088 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42089 // type, widen both sides to avoid a trip through memory.
42090 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
42091 Subtarget.hasAVX512()) {
42092 // Use zeros for the widening if we already have some zeroes. This can
42093 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
42094 // stream of this.
42095 // FIXME: It might make sense to detect a concat_vectors with a mix of
42096 // zeroes and undef and turn it into insert_subvector for i1 vectors as
42097 // a separate combine. What we can't do is canonicalize the operands of
42098 // such a concat or we'll get into a loop with SimplifyDemandedBits.
42099 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
42100 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
42101 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
42102 SrcVT = LastOp.getValueType();
42103 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42104 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
42105 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
42106 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42107 N0 = DAG.getBitcast(MVT::i8, N0);
42108 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42109 }
42110 }
42111
42112 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42113 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
42114 Ops[0] = N0;
42115 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42116 N0 = DAG.getBitcast(MVT::i8, N0);
42117 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42118 }
42119 } else {
42120 // If we're bitcasting from iX to vXi1, see if the integer originally
42121 // began as a vXi1 and whether we can remove the bitcast entirely.
42122 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
42123 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
42124 if (SDValue V =
42125 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
42126 return V;
42127 }
42128 }
42129
42130 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
42131 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
42132 // due to insert_subvector legalization on KNL. By promoting the copy to i16
42133 // we can help with known bits propagation from the vXi1 domain to the
42134 // scalar domain.
42135 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
42136 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42137 N0.getOperand(0).getValueType() == MVT::v16i1 &&
42138 isNullConstant(N0.getOperand(1)))
42139 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
42140 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
42141
42142 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
42143 // and the vbroadcast_load are both integer or both fp. In some cases this
42144 // will remove the bitcast entirely.
42145 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
42146 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
42147 auto *BCast = cast<MemIntrinsicSDNode>(N0);
42148 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
42149 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
42150 // Don't swap i8/i16 since we don't have fp types of that size.
42151 if (MemSize >= 32) {
42152 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
42153 : MVT::getIntegerVT(MemSize);
42154 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
42155 : MVT::getIntegerVT(SrcVTSize);
42156 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
42157
42158 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
42159 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
42160 SDValue ResNode =
42161 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
42162 MemVT, BCast->getMemOperand());
42163 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
42164 return DAG.getBitcast(VT, ResNode);
42165 }
42166 }
42167
42168 // Since MMX types are special and don't usually play with other vector types,
42169 // it's better to handle them early to be sure we emit efficient code by
42170 // avoiding store-load conversions.
42171 if (VT == MVT::x86mmx) {
42172 // Detect MMX constant vectors.
42173 APInt UndefElts;
42174 SmallVector<APInt, 1> EltBits;
42175 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
42176 SDLoc DL(N0);
42177 // Handle zero-extension of i32 with MOVD.
42178 if (EltBits[0].countLeadingZeros() >= 32)
42179 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
42180 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
42181 // Else, bitcast to a double.
42182 // TODO - investigate supporting sext 32-bit immediates on x86_64.
42183 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
42184 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
42185 }
42186
42187 // Detect bitcasts to x86mmx low word.
42188 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42189 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
42190 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
42191 bool LowUndef = true, AllUndefOrZero = true;
42192 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
42193 SDValue Op = N0.getOperand(i);
42194 LowUndef &= Op.isUndef() || (i >= e/2);
42195 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
42196 }
42197 if (AllUndefOrZero) {
42198 SDValue N00 = N0.getOperand(0);
42199 SDLoc dl(N00);
42200 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
42201 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
42202 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
42203 }
42204 }
42205
42206 // Detect bitcasts of 64-bit build vectors and convert to a
42207 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
42208 // lowest element.
42209 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42210 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
42211 SrcVT == MVT::v8i8))
42212 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
42213
42214 // Detect bitcasts between element or subvector extraction to x86mmx.
42215 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
42216 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
42217 isNullConstant(N0.getOperand(1))) {
42218 SDValue N00 = N0.getOperand(0);
42219 if (N00.getValueType().is128BitVector())
42220 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
42221 DAG.getBitcast(MVT::v2i64, N00));
42222 }
42223
42224 // Detect bitcasts from FP_TO_SINT to x86mmx.
42225 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
42226 SDLoc DL(N0);
42227 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
42228 DAG.getUNDEF(MVT::v2i32));
42229 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
42230 DAG.getBitcast(MVT::v2i64, Res));
42231 }
42232 }
42233
42234 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
42235 // most of these to scalar anyway.
42236 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
42237 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
42238 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
42239 return combinevXi1ConstantToInteger(N0, DAG);
42240 }
42241
42242 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42243 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42244 isa<ConstantSDNode>(N0)) {
42245 auto *C = cast<ConstantSDNode>(N0);
42246 if (C->isAllOnes())
42247 return DAG.getConstant(1, SDLoc(N0), VT);
42248 if (C->isZero())
42249 return DAG.getConstant(0, SDLoc(N0), VT);
42250 }
42251
42252 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
42253 // Turn it into a sign bit compare that produces a k-register. This avoids
42254 // a trip through a GPR.
42255 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42256 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42257 isPowerOf2_32(VT.getVectorNumElements())) {
42258 unsigned NumElts = VT.getVectorNumElements();
42259 SDValue Src = N0;
42260
42261 // Peek through truncate.
42262 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
42263 Src = N0.getOperand(0);
42264
42265 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
42266 SDValue MovmskIn = Src.getOperand(0);
42267 MVT MovmskVT = MovmskIn.getSimpleValueType();
42268 unsigned MovMskElts = MovmskVT.getVectorNumElements();
42269
42270 // We allow extra bits of the movmsk to be used since they are known zero.
42271 // We can't convert a VPMOVMSKB without avx512bw.
42272 if (MovMskElts <= NumElts &&
42273 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
42274 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
42275 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
42276 SDLoc dl(N);
42277 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
42278 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
42279 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
42280 if (EVT(CmpVT) == VT)
42281 return Cmp;
42282
42283 // Pad with zeroes up to original VT to replace the zeroes that were
42284 // being used from the MOVMSK.
42285 unsigned NumConcats = NumElts / MovMskElts;
42286 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
42287 Ops[0] = Cmp;
42288 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
42289 }
42290 }
42291 }
42292
42293 // Try to remove bitcasts from input and output of mask arithmetic to
42294 // remove GPR<->K-register crossings.
42295 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
42296 return V;
42297
42298 // Convert a bitcasted integer logic operation that has one bitcasted
42299 // floating-point operand into a floating-point logic operation. This may
42300 // create a load of a constant, but that is cheaper than materializing the
42301 // constant in an integer register and transferring it to an SSE register or
42302 // transferring the SSE operand to integer register and back.
42303 unsigned FPOpcode;
42304 switch (N0.getOpcode()) {
42305 case ISD::AND: FPOpcode = X86ISD::FAND; break;
42306 case ISD::OR: FPOpcode = X86ISD::FOR; break;
42307 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
42308 default: return SDValue();
42309 }
42310
42311 // Check if we have a bitcast from another integer type as well.
42312 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
42313 (Subtarget.hasSSE2() && VT == MVT::f64) ||
42314 (Subtarget.hasFP16() && VT == MVT::f16) ||
42315 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
42316 TLI.isTypeLegal(VT))))
42317 return SDValue();
42318
42319 SDValue LogicOp0 = N0.getOperand(0);
42320 SDValue LogicOp1 = N0.getOperand(1);
42321 SDLoc DL0(N0);
42322
42323 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
42324 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
42325 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
42326 LogicOp0.getOperand(0).getValueType() == VT &&
42327 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
42328 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
42329 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42330 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
42331 }
42332 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
42333 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
42334 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
42335 LogicOp1.getOperand(0).getValueType() == VT &&
42336 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
42337 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
42338 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42339 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
42340 }
42341
42342 return SDValue();
42343}
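A sketch of the final logic-op transform above for the f32 case (illustrative operands, not taken from the source):

// (f32 bitcast (i32 and (i32 bitcast (f32 X)), C))
//   --> (f32 X86ISD::FAND (f32 X), (f32 bitcast C))
// The bitwise AND now stays in an SSE register; C may become a constant-pool
// load, which the comment above argues is still cheaper than a GPR round trip.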
42344
42345 // (mul (zext a), (sext b))
42346static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
42347 SDValue &Op1) {
42348 Op0 = Mul.getOperand(0);
42349 Op1 = Mul.getOperand(1);
42350
42351 // Operand 1 should be a sign extend.
42352 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
42353 std::swap(Op0, Op1);
42354
42355 auto IsFreeTruncation = [](SDValue &Op) -> bool {
42356 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
42357 Op.getOpcode() == ISD::SIGN_EXTEND) &&
42358 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
42359 return true;
42360
42361 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
42362 return (BV && BV->isConstant());
42363 };
42364
42365 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
42366 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
42367 // signed value, so we just check its number of significant bits.
42368 if ((IsFreeTruncation(Op0) &&
42369 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
42370 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
42371 return true;
42372
42373 return false;
42374}
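An example of a multiply that detectExtMul accepts (illustrative):

// Op0 = (zero_extend (vXi8 A))   // known to fit in 8 unsigned bits
// Op1 = (sign_extend (vXi8 B))   // at most 8 significant bits
// (mul Op0, Op1) can then be lowered via VPDPBUSD, which multiplies unsigned
// bytes from the first source by signed bytes from the second.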
42375
42376 // Given an ABS node, detect the following pattern:
42377// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
42378// This is useful as it is the input into a SAD pattern.
42379static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
42380 SDValue AbsOp1 = Abs->getOperand(0);
42381 if (AbsOp1.getOpcode() != ISD::SUB)
42382 return false;
42383
42384 Op0 = AbsOp1.getOperand(0);
42385 Op1 = AbsOp1.getOperand(1);
42386
42387 // Check if the operands of the sub are zero-extended from vectors of i8.
42388 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
42389 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
42390 Op1.getOpcode() != ISD::ZERO_EXTEND ||
42391 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
42392 return false;
42393
42394 return true;
42395}
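The DAG shape detectZextAbsDiff matches, written out (illustrative):

// (abs (sub (zero_extend (vXi8 A)), (zero_extend (vXi8 B))))
// Op0/Op1 are set to the two zero_extend nodes; their vXi8 sources become the
// PSADBW operands in createPSADBW below.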
42396
42397static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
42398 unsigned &LogBias, const SDLoc &DL,
42399 const X86Subtarget &Subtarget) {
42400 // Extend or truncate to MVT::i8 first.
42401 MVT Vi8VT =
42402 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
42403 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
42404 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
42405
42406 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
42407 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
42408 // The src A, B element type is i8, but the dst C element type is i32.
42409 // When we calculate the number of reduction stages we use the src vector
42410 // type vXi8, so we need a log-bias of 2 to avoid 2 extra stages.
42411 LogBias = 2;
42412
42413 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
42414 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
42415 RegSize = std::max(512u, RegSize);
42416
42417 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
42418 // fill in the missing vector elements with 0.
42419 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
42420 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
42421 Ops[0] = LHS;
42422 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42423 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42424 Ops[0] = RHS;
42425 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42426
42427 // Actually build the DotProduct, split as 256/512 bits for
42428 // AVXVNNI/AVX512VNNI.
42429 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42430 ArrayRef<SDValue> Ops) {
42431 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
42432 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
42433 };
42434 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
42435 SDValue Zero = DAG.getConstant(0, DL, DpVT);
42436
42437 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
42438 DpBuilder, false);
42439}
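A numeric sketch of the log-bias returned above (assuming a v16i8 input; the numbers are only illustrative):

// Full add reduction of 16 lanes: log2(16) = 4 shuffle+add stages.
// One VPDPBUSD already sums 4 byte products into each i32 lane: log2(4) = 2.
// Stages left for the caller to emit: 4 - 2 = 2, hence LogBias = 2.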
42440
42441// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
42442// to these zexts.
42443static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
42444 const SDValue &Zext1, const SDLoc &DL,
42445 const X86Subtarget &Subtarget) {
42446 // Find the appropriate width for the PSADBW.
42447 EVT InVT = Zext0.getOperand(0).getValueType();
42448 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
42449
42450 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
42451 // fill in the missing vector elements with 0.
42452 unsigned NumConcat = RegSize / InVT.getSizeInBits();
42453 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
42454 Ops[0] = Zext0.getOperand(0);
42455 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42456 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42457 Ops[0] = Zext1.getOperand(0);
42458 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42459
42460 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
42461 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42462 ArrayRef<SDValue> Ops) {
42463 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
42464 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
42465 };
42466 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
42467 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
42468 PSADBWBuilder);
42469}
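What a single PSADBW lane computes, with made-up byte values:

// For each 64-bit chunk, PSADBW sums |A[i] - B[i]| over the 8 byte pairs and
// stores the result in the low bits of that i64 lane, e.g.
//   A = {1, 5, 0, ...}, B = {4, 2, 7, ...}  ->  |1-4| + |5-2| + |0-7| + ...
// so a <k x i8> absolute-difference sum collapses to k/8 partial sums that the
// callers then fold together.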
42470
42471 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
42472// PHMINPOSUW.
42473static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
42474 const X86Subtarget &Subtarget) {
42475 // Bail without SSE41.
42476 if (!Subtarget.hasSSE41())
42477 return SDValue();
42478
42479 EVT ExtractVT = Extract->getValueType(0);
42480 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
42481 return SDValue();
42482
42483 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
42484 ISD::NodeType BinOp;
42485 SDValue Src = DAG.matchBinOpReduction(
42486 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
42487 if (!Src)
42488 return SDValue();
42489
42490 EVT SrcVT = Src.getValueType();
42491 EVT SrcSVT = SrcVT.getScalarType();
42492 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
42493 return SDValue();
42494
42495 SDLoc DL(Extract);
42496 SDValue MinPos = Src;
42497
42498 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
42499 while (SrcVT.getSizeInBits() > 128) {
42500 SDValue Lo, Hi;
42501 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
42502 SrcVT = Lo.getValueType();
42503 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
42504 }
42505 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
42506 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
42507 "Unexpected value type");
42508
42509 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
42510 // to flip the value accordingly.
42511 SDValue Mask;
42512 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
42513 if (BinOp == ISD::SMAX)
42514 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
42515 else if (BinOp == ISD::SMIN)
42516 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
42517 else if (BinOp == ISD::UMAX)
42518 Mask = DAG.getAllOnesConstant(DL, SrcVT);
42519
42520 if (Mask)
42521 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42522
42523 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
42524 // shuffling each upper element down and inserting zeros. This means that the
42525 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
42526 // ready for the PHMINPOS.
42527 if (ExtractVT == MVT::i8) {
42528 SDValue Upper = DAG.getVectorShuffle(
42529 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
42530 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
42531 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
42532 }
42533
42534 // Perform the PHMINPOS on a v8i16 vector.
42535 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
42536 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
42537 MinPos = DAG.getBitcast(SrcVT, MinPos);
42538
42539 if (Mask)
42540 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42541
42542 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
42543 DAG.getIntPtrConstant(0, DL));
42544}
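A sketch of the XOR trick used above, shown for the UMAX case (illustrative):

// umax(a, b) == ~umin(~a, ~b), so XORing the vector with all-ones converts a
// UMAX reduction into the UMIN reduction that PHMINPOSUW implements; the
// second XOR after PHMINPOS undoes the inversion on the reduced value. The
// SMIN/SMAX masks play the same role via the sign bit.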
42545
42546// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
42547static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
42548 const X86Subtarget &Subtarget) {
42549 // Bail without SSE2.
42550 if (!Subtarget.hasSSE2())
42551 return SDValue();
42552
42553 EVT ExtractVT = Extract->getValueType(0);
42554 unsigned BitWidth = ExtractVT.getSizeInBits();
42555 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
42556 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
42557 return SDValue();
42558
42559 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
42560 ISD::NodeType BinOp;
42561 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
42562 if (!Match && ExtractVT == MVT::i1)
42563 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
42564 if (!Match)
42565 return SDValue();
42566
42567 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
42568 // which we can't support here for now.
42569 if (Match.getScalarValueSizeInBits() != BitWidth)
42570 return SDValue();
42571
42572 SDValue Movmsk;
42573 SDLoc DL(Extract);
42574 EVT MatchVT = Match.getValueType();
42575 unsigned NumElts = MatchVT.getVectorNumElements();
42576 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
42577 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42578
42579 if (ExtractVT == MVT::i1) {
42580 // Special case for (pre-legalization) vXi1 reductions.
42581 if (NumElts > 64 || !isPowerOf2_32(NumElts))
42582 return SDValue();
42583 if (TLI.isTypeLegal(MatchVT)) {
42584 // If this is a legal AVX512 predicate type then we can just bitcast.
42585 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42586 Movmsk = DAG.getBitcast(MovmskVT, Match);
42587 } else {
42588 // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
42589 if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
42590 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
42591 ISD::CondCode::SETEQ) {
42592 EVT VecSVT = Match.getOperand(0).getValueType().getScalarType();
42593 if (VecSVT != MVT::i8) {
42594 NumElts *= VecSVT.getSizeInBits() / 8;
42595 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts);
42596 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
42597 Match = DAG.getSetCC(
42598 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
42599 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
42600 }
42601 }
42602
42603 // Use combineBitcastvxi1 to create the MOVMSK.
42604 while (NumElts > MaxElts) {
42605 SDValue Lo, Hi;
42606 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42607 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42608 NumElts /= 2;
42609 }
42610 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42611 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
42612 }
42613 if (!Movmsk)
42614 return SDValue();
42615 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
42616 } else {
42617 // FIXME: Better handling of k-registers or 512-bit vectors?
42618 unsigned MatchSizeInBits = Match.getValueSizeInBits();
42619 if (!(MatchSizeInBits == 128 ||
42620 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
42621 return SDValue();
42622
42623 // Make sure this isn't a vector of 1 element. The perf win from using
42624 // MOVMSK diminishes with fewer elements in the reduction, but it is
42625 // generally better to get the comparison over to the GPRs as soon as
42626 // possible to reduce the number of vector ops.
42627 if (Match.getValueType().getVectorNumElements() < 2)
42628 return SDValue();
42629
42630 // Check that we are extracting a reduction of all sign bits.
42631 if (DAG.ComputeNumSignBits(Match) != BitWidth)
42632 return SDValue();
42633
42634 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
42635 SDValue Lo, Hi;
42636 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42637 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42638 MatchSizeInBits = Match.getValueSizeInBits();
42639 }
42640
42641 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
42642 MVT MaskSrcVT;
42643 if (64 == BitWidth || 32 == BitWidth)
42644 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
42645 MatchSizeInBits / BitWidth);
42646 else
42647 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
42648
42649 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
42650 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
42651 NumElts = MaskSrcVT.getVectorNumElements();
42652 }
42653 assert((NumElts <= 32 || NumElts == 64) &&
42654 "Not expecting more than 64 elements");
42655
42656 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
42657 if (BinOp == ISD::XOR) {
42658 // parity -> (PARITY(MOVMSK X))
42659 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
42660 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
42661 }
42662
42663 SDValue CmpC;
42664 ISD::CondCode CondCode;
42665 if (BinOp == ISD::OR) {
42666 // any_of -> MOVMSK != 0
42667 CmpC = DAG.getConstant(0, DL, CmpVT);
42668 CondCode = ISD::CondCode::SETNE;
42669 } else {
42670 // all_of -> MOVMSK == ((1 << NumElts) - 1)
42671 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
42672 DL, CmpVT);
42673 CondCode = ISD::CondCode::SETEQ;
42674 }
42675
42676 // The setcc produces an i8 of 0/1, so extend that to the result width and
42677 // negate to get the final 0/-1 mask value.
42678 EVT SetccVT =
42679 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
42680 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
42681 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
42682 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
42683 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
42684}
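The comparison constants used above, spelled out for a v16i8 reduction (illustrative):

// Movmsk = MOVMSK(v16i8 X)               // 16 sign bits packed into an i32
// any_of: setcc(Movmsk, 0, SETNE)        // at least one lane was set
// all_of: setcc(Movmsk, 0xFFFF, SETEQ)   // (1 << 16) - 1, every lane set
// parity: PARITY(Movmsk)                 // XOR reduction of the lanes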
42685
42686static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
42687 const X86Subtarget &Subtarget) {
42688 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
42689 return SDValue();
42690
42691 EVT ExtractVT = Extract->getValueType(0);
42692 // Verify the type we're extracting is i32, as the output element type of
42693 // vpdpbusd is i32.
42694 if (ExtractVT != MVT::i32)
42695 return SDValue();
42696
42697 EVT VT = Extract->getOperand(0).getValueType();
42698 if (!isPowerOf2_32(VT.getVectorNumElements()))
42699 return SDValue();
42700
42701 // Match shuffle + add pyramid.
42702 ISD::NodeType BinOp;
42703 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42704
42705 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
42706 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
42707 // before adding into the accumulator.
42708 // TODO:
42709 // We also need to verify that the multiply has at least 2x the number of bits
42710 // of the input. We shouldn't match
42711 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
42712 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
42713 // Root = Root.getOperand(0);
42714
42715 // If there was a match, we want Root to be a mul.
42716 if (!Root || Root.getOpcode() != ISD::MUL)
42717 return SDValue();
42718
42719 // Check whether we have an extend and mul pattern
42720 SDValue LHS, RHS;
42721 if (!detectExtMul(DAG, Root, LHS, RHS))
42722 return SDValue();
42723
42724 // Create the dot product instruction.
42725 SDLoc DL(Extract);
42726 unsigned StageBias;
42727 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
42728
42729 // If the original vector was wider than 4 elements, sum over the results
42730 // in the DP vector.
42731 unsigned Stages = Log2_32(VT.getVectorNumElements());
42732 EVT DpVT = DP.getValueType();
42733
42734 if (Stages > StageBias) {
42735 unsigned DpElems = DpVT.getVectorNumElements();
42736
42737 for (unsigned i = Stages - StageBias; i > 0; --i) {
42738 SmallVector<int, 16> Mask(DpElems, -1);
42739 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42740 Mask[j] = MaskEnd + j;
42741
42742 SDValue Shuffle =
42743 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
42744 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
42745 }
42746 }
42747
42748 // Return the lowest ExtractSizeInBits bits.
42749 EVT ResVT =
42750 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42751 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
42752 DP = DAG.getBitcast(ResVT, DP);
42753 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
42754 Extract->getOperand(1));
42755}
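One folding step of the loop above, assuming an 8 x i32 DP vector (illustrative):

// i = 1, MaskEnd = 1:  Mask = {1, -1, -1, -1, -1, -1, -1, -1}
// DP = add(DP, shuffle(DP, undef, Mask)) adds lane 1 into lane 0, leaving the
// reduced value in element 0 for the final EXTRACT_VECTOR_ELT.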
42756
42757static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
42758 const X86Subtarget &Subtarget) {
42759 // PSADBW is only supported on SSE2 and up.
42760 if (!Subtarget.hasSSE2())
42761 return SDValue();
42762
42763 EVT ExtractVT = Extract->getValueType(0);
42764 // Verify the type we're extracting is either i32 or i64.
42765 // FIXME: Could support other types, but this is what we have coverage for.
42766 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
42767 return SDValue();
42768
42769 EVT VT = Extract->getOperand(0).getValueType();
42770 if (!isPowerOf2_32(VT.getVectorNumElements()))
42771 return SDValue();
42772
42773 // Match shuffle + add pyramid.
42774 ISD::NodeType BinOp;
42775 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42776
42777 // The operand is expected to be zero extended from i8
42778 // (verified in detectZextAbsDiff).
42779 // In order to convert to i64 and above, additional any/zero/sign
42780 // extend is expected.
42781 // The zero extend from 32 bits has no mathematical effect on the result.
42782 // Also, the sign extend is basically a zero extend
42783 // (it extends the sign bit, which is zero).
42784 // So it is correct to skip the sign/zero extend instruction.
42785 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
42786 Root.getOpcode() == ISD::ZERO_EXTEND ||
42787 Root.getOpcode() == ISD::ANY_EXTEND))
42788 Root = Root.getOperand(0);
42789
42790 // If there was a match, we want Root to be a select that is the root of an
42791 // abs-diff pattern.
42792 if (!Root || Root.getOpcode() != ISD::ABS)
42793 return SDValue();
42794
42795 // Check whether we have an abs-diff pattern feeding into the select.
42796 SDValue Zext0, Zext1;
42797 if (!detectZextAbsDiff(Root, Zext0, Zext1))
42798 return SDValue();
42799
42800 // Create the SAD instruction.
42801 SDLoc DL(Extract);
42802 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
42803
42804 // If the original vector was wider than 8 elements, sum over the results
42805 // in the SAD vector.
42806 unsigned Stages = Log2_32(VT.getVectorNumElements());
42807 EVT SadVT = SAD.getValueType();
42808 if (Stages > 3) {
42809 unsigned SadElems = SadVT.getVectorNumElements();
42810
42811 for(unsigned i = Stages - 3; i > 0; --i) {
42812 SmallVector<int, 16> Mask(SadElems, -1);
42813 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42814 Mask[j] = MaskEnd + j;
42815
42816 SDValue Shuffle =
42817 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
42818 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
42819 }
42820 }
42821
42822 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
42823 // Return the lowest ExtractSizeInBits bits.
42824 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42825 SadVT.getSizeInBits() / ExtractSizeInBits);
42826 SAD = DAG.getBitcast(ResVT, SAD);
42827 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
42828 Extract->getOperand(1));
42829}
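An end-to-end sketch of the pattern this combine replaces, for a v16i8 input (illustrative):

// Input:  i32 add reduction of (abs (sub (zext (v16i8 A)), (zext (v16i8 B))))
// Step 1: PSADBW A, B   -> v2i64 holding two per-8-byte partial sums
// Step 2: Stages - 3 = log2(16) - 3 = 1 shuffle+add fold sums the two halves
// Step 3: bitcast + extract element 0 yields the scalar result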
42830
42831// Attempt to peek through a target shuffle and extract the scalar from the
42832// source.
42833static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
42834 TargetLowering::DAGCombinerInfo &DCI,
42835 const X86Subtarget &Subtarget) {
42836 if (DCI.isBeforeLegalizeOps())
42837 return SDValue();
42838
42839 SDLoc dl(N);
42840 SDValue Src = N->getOperand(0);
42841 SDValue Idx = N->getOperand(1);
42842
42843 EVT VT = N->getValueType(0);
42844 EVT SrcVT = Src.getValueType();
42845 EVT SrcSVT = SrcVT.getVectorElementType();
42846 unsigned SrcEltBits = SrcSVT.getSizeInBits();
42847 unsigned NumSrcElts = SrcVT.getVectorNumElements();
42848
42849 // Don't attempt this for boolean mask vectors or unknown extraction indices.
42850 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
42851 return SDValue();
42852
42853 const APInt &IdxC = N->getConstantOperandAPInt(1);
42854 if (IdxC.uge(NumSrcElts))
42855 return SDValue();
42856
42857 SDValue SrcBC = peekThroughBitcasts(Src);
42858
42859 // Handle extract(bitcast(broadcast(scalar_value))).
42860 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
42861 SDValue SrcOp = SrcBC.getOperand(0);
42862 EVT SrcOpVT = SrcOp.getValueType();
42863 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
42864 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
42865 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
42866 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
42867 // TODO support non-zero offsets.
42868 if (Offset == 0) {
42869 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
42870 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
42871 return SrcOp;
42872 }
42873 }
42874 }
42875
42876 // If we're extracting a single element from a broadcast load and there are
42877 // no other users, just create a single load.
42878 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
42879 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
42880 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
42881 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
42882 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
42883 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
42884 MemIntr->getBasePtr(),
42885 MemIntr->getPointerInfo(),
42886 MemIntr->getOriginalAlign(),
42887 MemIntr->getMemOperand()->getFlags());
42888 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42889 return Load;
42890 }
42891 }
42892
42893 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
42894 // TODO: Move to DAGCombine?
42895 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
42896 SrcBC.getValueType().isInteger() &&
42897 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
42898 SrcBC.getScalarValueSizeInBits() ==
42899 SrcBC.getOperand(0).getValueSizeInBits()) {
42900 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
42901 if (IdxC.ult(Scale)) {
42902 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
42903 SDValue Scl = SrcBC.getOperand(0);
42904 EVT SclVT = Scl.getValueType();
42905 if (Offset) {
42906 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
42907 DAG.getShiftAmountConstant(Offset, SclVT, dl));
42908 }
42909 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
42910 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
42911 return Scl;
42912 }
42913 }
42914
42915 // Handle extract(truncate(x)) for 0'th index.
42916 // TODO: Treat this as a faux shuffle?
42917 // TODO: When can we use this for general indices?
42918 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
42919 (SrcVT.getSizeInBits() % 128) == 0) {
42920 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
42921 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
42922 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
42923 Idx);
42924 }
42925
42926 // We can only legally extract other elements from 128-bit vectors and in
42927 // certain circumstances, depending on SSE-level.
42928 // TODO: Investigate float/double extraction if it will be just stored.
42929 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
42930 unsigned Idx) {
42931 EVT VecSVT = VecVT.getScalarType();
42932 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
42933 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
42934 VecSVT == MVT::i64)) {
42935 unsigned EltSizeInBits = VecSVT.getSizeInBits();
42936 unsigned NumEltsPerLane = 128 / EltSizeInBits;
42937 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
42938 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
42939 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
42940 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
42941 Idx &= (NumEltsPerLane - 1);
42942 }
42943 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
42944 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
42945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
42946 DAG.getBitcast(VecVT, Vec),
42947 DAG.getIntPtrConstant(Idx, dl));
42948 }
42949 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
42950 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
42951 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
42952 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
42953 DAG.getTargetConstant(Idx, dl, MVT::i8));
42954 }
42955 return SDValue();
42956 };
42957
42958 // Resolve the target shuffle inputs and mask.
42959 SmallVector<int, 16> Mask;
42960 SmallVector<SDValue, 2> Ops;
42961 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
42962 return SDValue();
42963
42964 // Shuffle inputs must be the same size as the result.
42965 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
42966 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
42967 }))
42968 return SDValue();
42969
42970 // Attempt to narrow/widen the shuffle mask to the correct size.
42971 if (Mask.size() != NumSrcElts) {
42972 if ((NumSrcElts % Mask.size()) == 0) {
42973 SmallVector<int, 16> ScaledMask;
42974 int Scale = NumSrcElts / Mask.size();
42975 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
42976 Mask = std::move(ScaledMask);
42977 } else if ((Mask.size() % NumSrcElts) == 0) {
42978 // Simplify Mask based on demanded element.
42979 int ExtractIdx = (int)IdxC.getZExtValue();
42980 int Scale = Mask.size() / NumSrcElts;
42981 int Lo = Scale * ExtractIdx;
42982 int Hi = Scale * (ExtractIdx + 1);
42983 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
42984 if (i < Lo || Hi <= i)
42985 Mask[i] = SM_SentinelUndef;
42986
42987 SmallVector<int, 16> WidenedMask;
42988 while (Mask.size() > NumSrcElts &&
42989 canWidenShuffleElements(Mask, WidenedMask))
42990 Mask = std::move(WidenedMask);
42991 }
42992 }
42993
42994 // If narrowing/widening failed, see if we can extract+zero-extend.
42995 int ExtractIdx;
42996 EVT ExtractVT;
42997 if (Mask.size() == NumSrcElts) {
42998 ExtractIdx = Mask[IdxC.getZExtValue()];
42999 ExtractVT = SrcVT;
43000 } else {
43001 unsigned Scale = Mask.size() / NumSrcElts;
43002 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
43003 return SDValue();
43004 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
43005 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
43006 return SDValue();
43007 ExtractIdx = Mask[ScaledIdx];
43008 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
43009 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
43010 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
43011 "Failed to widen vector type");
43012 }
43013
43014 // If the shuffle source element is undef/zero then we can just accept it.
43015 if (ExtractIdx == SM_SentinelUndef)
43016 return DAG.getUNDEF(VT);
43017
43018 if (ExtractIdx == SM_SentinelZero)
43019 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
43020 : DAG.getConstant(0, dl, VT);
43021
43022 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
43023 ExtractIdx = ExtractIdx % Mask.size();
43024 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
43025 return DAG.getZExtOrTrunc(V, dl, VT);
43026
43027 return SDValue();
43028}
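A small example of the peek-through above (illustrative, assuming SSE4.1 so the non-zero-index extract is legal):

// (i32 extract_elt (v4i32 shuffle<2,3,0,1> X, undef), 0)
//   --> (i32 extract_elt X, 2)
// The shuffle mask is resolved, the demanded lane is traced back to its real
// source, and GetLegalExtract emits the extract directly from X.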
43029
43030/// Extracting a scalar FP value from vector element 0 is free, so extract each
43031/// operand first, then perform the math as a scalar op.
43032static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
43033 const X86Subtarget &Subtarget) {
43034 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
43035 SDValue Vec = ExtElt->getOperand(0);
43036 SDValue Index = ExtElt->getOperand(1);
43037 EVT VT = ExtElt->getValueType(0);
43038 EVT VecVT = Vec.getValueType();
43039
43040 // TODO: If this is a unary/expensive/expand op, allow extraction from a
43041 // non-zero element because the shuffle+scalar op will be cheaper?
43042 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
43043 return SDValue();
43044
43045 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
43046 // extract, the condition code), so deal with those as a special-case.
43047 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
43048 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
43049 if (OpVT != MVT::f32 && OpVT != MVT::f64)
43050 return SDValue();
43051
43052 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
43053 SDLoc DL(ExtElt);
43054 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43055 Vec.getOperand(0), Index);
43056 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43057 Vec.getOperand(1), Index);
43058 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
43059 }
43060
43061 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
43062 VT != MVT::f64)
43063 return SDValue();
43064
43065 // Vector FP selects don't fit the pattern of FP math ops (because the
43066 // condition has a different type and we have to change the opcode), so deal
43067 // with those here.
43068 // FIXME: This is restricted to pre type legalization by ensuring the setcc
43069 // has i1 elements. If we loosen this we need to convert vector bool to a
43070 // scalar bool.
43071 if (Vec.getOpcode() == ISD::VSELECT &&
43072 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
43073 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
43074 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
43075 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
43076 SDLoc DL(ExtElt);
43077 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
43078 Vec.getOperand(0).getValueType().getScalarType(),
43079 Vec.getOperand(0), Index);
43080 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43081 Vec.getOperand(1), Index);
43082 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43083 Vec.getOperand(2), Index);
43084 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
43085 }
43086
43087 // TODO: This switch could include FNEG and the x86-specific FP logic ops
43088 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
43089 // missed load folding and fma+fneg combining.
43090 switch (Vec.getOpcode()) {
43091 case ISD::FMA: // Begin 3 operands
43092 case ISD::FMAD:
43093 case ISD::FADD: // Begin 2 operands
43094 case ISD::FSUB:
43095 case ISD::FMUL:
43096 case ISD::FDIV:
43097 case ISD::FREM:
43098 case ISD::FCOPYSIGN:
43099 case ISD::FMINNUM:
43100 case ISD::FMAXNUM:
43101 case ISD::FMINNUM_IEEE:
43102 case ISD::FMAXNUM_IEEE:
43103 case ISD::FMAXIMUM:
43104 case ISD::FMINIMUM:
43105 case X86ISD::FMAX:
43106 case X86ISD::FMIN:
43107 case ISD::FABS: // Begin 1 operand
43108 case ISD::FSQRT:
43109 case ISD::FRINT:
43110 case ISD::FCEIL:
43111 case ISD::FTRUNC:
43112 case ISD::FNEARBYINT:
43113 case ISD::FROUND:
43114 case ISD::FFLOOR:
43115 case X86ISD::FRCP:
43116 case X86ISD::FRSQRT: {
43117 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
43118 SDLoc DL(ExtElt);
43119 SmallVector<SDValue, 4> ExtOps;
43120 for (SDValue Op : Vec->ops())
43121 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
43122 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
43123 }
43124 default:
43125 return SDValue();
43126 }
43127 llvm_unreachable("All opcodes should return within switch");
43128}
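One instance of the scalarization performed above (illustrative):

// (f32 extract_elt (v4f32 fadd X, Y), 0)
//   --> (f32 fadd (extract_elt X, 0), (extract_elt Y, 0))
// Extracting element 0 of an FP vector is free, so when the vector fadd has no
// other users it is traded for a scalar fadd on the extracted operands.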
43129
43130/// Try to convert a vector reduction sequence composed of binops and shuffles
43131/// into horizontal ops.
43132static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
43133 const X86Subtarget &Subtarget) {
43134 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
43135
43136 // We need at least SSE2 to do anything here.
43137 if (!Subtarget.hasSSE2())
43138 return SDValue();
43139
43140 ISD::NodeType Opc;
43141 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
43142 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
43143 if (!Rdx)
43144 return SDValue();
43145
43146 SDValue Index = ExtElt->getOperand(1);
43147 assert(isNullConstant(Index) &&
43148 "Reduction doesn't end in an extract from index 0");
43149
43150 EVT VT = ExtElt->getValueType(0);
43151 EVT VecVT = Rdx.getValueType();
43152 if (VecVT.getScalarType() != VT)
43153 return SDValue();
43154
43155 SDLoc DL(ExtElt);
43156 unsigned NumElts = VecVT.getVectorNumElements();
43157 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
43158
43159 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
43160 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
43161 if (V.getValueType() == MVT::v4i8) {
43162 if (ZeroExtend && Subtarget.hasSSE41()) {
43163 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
43164 DAG.getConstant(0, DL, MVT::v4i32),
43165 DAG.getBitcast(MVT::i32, V),
43166 DAG.getIntPtrConstant(0, DL));
43167 return DAG.getBitcast(MVT::v16i8, V);
43168 }
43169 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
43170 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
43171 : DAG.getUNDEF(MVT::v4i8));
43172 }
43173 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
43174 DAG.getUNDEF(MVT::v8i8));
43175 };
43176
43177 // vXi8 mul reduction - promote to vXi16 mul reduction.
43178 if (Opc == ISD::MUL) {
43179 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
43180 return SDValue();
43181 if (VecVT.getSizeInBits() >= 128) {
43182 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
43183 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43184 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43185 Lo = DAG.getBitcast(WideVT, Lo);
43186 Hi = DAG.getBitcast(WideVT, Hi);
43187 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
43188 while (Rdx.getValueSizeInBits() > 128) {
43189 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43190 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
43191 }
43192 } else {
43193 Rdx = WidenToV16I8(Rdx, false);
43194 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
43195 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
43196 }
43197 if (NumElts >= 8)
43198 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43199 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43200 {4, 5, 6, 7, -1, -1, -1, -1}));
43201 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43202 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43203 {2, 3, -1, -1, -1, -1, -1, -1}));
43204 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43205 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43206 {1, -1, -1, -1, -1, -1, -1, -1}));
43207 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43208 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43209 }
43210
43211 // vXi8 add reduction - sub 128-bit vector.
43212 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
43213 Rdx = WidenToV16I8(Rdx, true);
43214 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43215 DAG.getConstant(0, DL, MVT::v16i8));
43216 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43217 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43218 }
43219
43220 // Must be a >=128-bit vector with pow2 elements.
43221 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
43222 return SDValue();
43223
43224 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
43225 if (VT == MVT::i8) {
43226 while (Rdx.getValueSizeInBits() > 128) {
43227 SDValue Lo, Hi;
43228 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43229 VecVT = Lo.getValueType();
43230 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43231 }
43232 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
43233
43234 SDValue Hi = DAG.getVectorShuffle(
43235 MVT::v16i8, DL, Rdx, Rdx,
43236 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
43237 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
43238 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43239 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
43240 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43242 }
43243
43244 // See if we can use vXi8 PSADBW add reduction for larger zext types.
43245 // If the source vector values are 0-255, then we can use PSADBW to
43246 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
43247 // TODO: See if it's worth avoiding vXi16/i32 truncations?
43248 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
43249 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
43250 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
43251 Subtarget.hasAVX512())) {
43252 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
43253 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
43254 if (ByteVT.getSizeInBits() < 128)
43255 Rdx = WidenToV16I8(Rdx, true);
43256
43257 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43258 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43259 ArrayRef<SDValue> Ops) {
43260 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43261 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
43262 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
43263 };
43264 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
43265 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
43266
43267 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
43268 while (Rdx.getValueSizeInBits() > 128) {
43269 SDValue Lo, Hi;
43270 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43271 VecVT = Lo.getValueType();
43272 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43273 }
43274 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
43275
43276 if (NumElts > 8) {
43277 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
43278 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
43279 }
43280
43281 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
43282 Rdx = DAG.getBitcast(VecVT, Rdx);
43283 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43284 }
43285
43286 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
43287 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
43288 return SDValue();
43289
43290 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
43291
43292 // 256-bit horizontal instructions operate on 128-bit chunks rather than
43293 // across the whole vector, so we need an extract + hop preliminary stage.
43294 // This is the only step where the operands of the hop are not the same value.
43295 // TODO: We could extend this to handle 512-bit or even longer vectors.
43296 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
43297 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
43298 unsigned NumElts = VecVT.getVectorNumElements();
43299 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
43300 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
43301 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
43302 VecVT = Rdx.getValueType();
43303 }
43304 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
43305 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
43306 return SDValue();
43307
43308 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
43309 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
43310 for (unsigned i = 0; i != ReductionSteps; ++i)
43311 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
43312
43313 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43314}
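The (F)HADD tail of the combine above, for a v4f32 FADD reduction (illustrative, assuming SSE3 and a target where horizontal ops are deemed profitable):

// Rdx = HADDPS Rdx, Rdx   // stage 1: {a0+a1, a2+a3, a0+a1, a2+a3}
// Rdx = HADDPS Rdx, Rdx   // stage 2: {a0+a1+a2+a3, ...}
// return extract_elt Rdx, 0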
43315
43316/// Detect vector gather/scatter index generation and convert it from being a
43317/// bunch of shuffles and extracts into a somewhat faster sequence.
43318/// For i686, the best sequence is apparently storing the value and loading
43319/// scalars back, while for x64 we should use 64-bit extracts and shifts.
43320static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
43321 TargetLowering::DAGCombinerInfo &DCI,
43322 const X86Subtarget &Subtarget) {
43323 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
43324 return NewOp;
43325
43326 SDValue InputVector = N->getOperand(0);
43327 SDValue EltIdx = N->getOperand(1);
43328 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
43329
43330 EVT SrcVT = InputVector.getValueType();
43331 EVT VT = N->getValueType(0);
43332 SDLoc dl(InputVector);
43333 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
43334 unsigned NumSrcElts = SrcVT.getVectorNumElements();
43335
43336 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
43337 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43338
43339 // Integer Constant Folding.
43340 if (CIdx && VT.isInteger()) {
43341 APInt UndefVecElts;
43342 SmallVector<APInt, 16> EltBits;
43343 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
43344 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
43345 EltBits, true, false)) {
43346 uint64_t Idx = CIdx->getZExtValue();
43347 if (UndefVecElts[Idx])
43348 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43349 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
43350 dl, VT);
43351 }
43352 }
43353
43354 if (IsPextr) {
43355 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43356 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43357 APInt::getAllOnes(VT.getSizeInBits()), DCI))
43358 return SDValue(N, 0);
43359
43360 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
43361 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
43362 InputVector.getOpcode() == X86ISD::PINSRW) &&
43363 InputVector.getOperand(2) == EltIdx) {
43364       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
43365              "Vector type mismatch");
43366 SDValue Scl = InputVector.getOperand(1);
43367 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
43368 return DAG.getZExtOrTrunc(Scl, dl, VT);
43369 }
43370
43371 // TODO - Remove this once we can handle the implicit zero-extension of
43372 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
43373 // combineBasicSADPattern.
43374 return SDValue();
43375 }
43376
43377 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
43378 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43379 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
43380 SDValue MMXSrc = InputVector.getOperand(0);
43381
43382 // The bitcast source is a direct mmx result.
43383 if (MMXSrc.getValueType() == MVT::x86mmx)
43384 return DAG.getBitcast(VT, InputVector);
43385 }
43386
43387 // Detect mmx to i32 conversion through a v2i32 elt extract.
43388 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43389 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
43390 SDValue MMXSrc = InputVector.getOperand(0);
43391
43392 // The bitcast source is a direct mmx result.
43393 if (MMXSrc.getValueType() == MVT::x86mmx)
43394 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
43395 }
43396
43397 // Check whether this extract is the root of a sum of absolute differences
43398 // pattern. This has to be done here because we really want it to happen
43399 // pre-legalization.
43400 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
43401 return SAD;
43402
43403 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
43404 return VPDPBUSD;
43405
43406 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
43407 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
43408 return Cmp;
43409
43410 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
43411 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
43412 return MinMax;
43413
43414 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
43415 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
43416 return V;
43417
43418 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
43419 return V;
43420
43421 // Attempt to extract an i1 element by using MOVMSK to extract the signbits
43422 // and then testing the relevant element.
43423 //
43424 // Note that we only combine extracts on the *same* result number, i.e.
43425 // t0 = merge_values a0, a1, a2, a3
43426 // i1 = extract_vector_elt t0, Constant:i64<2>
43427 // i1 = extract_vector_elt t0, Constant:i64<3>
43428 // but not
43429 // i1 = extract_vector_elt t0:1, Constant:i64<2>
43430 // since the latter would need its own MOVMSK.
43431 if (SrcVT.getScalarType() == MVT::i1) {
43432 bool IsVar = !CIdx;
43433 SmallVector<SDNode *, 16> BoolExtracts;
43434 unsigned ResNo = InputVector.getResNo();
43435 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
43436 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
43437 Use->getOperand(0).getResNo() == ResNo &&
43438 Use->getValueType(0) == MVT::i1) {
43439 BoolExtracts.push_back(Use);
43440 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
43441 return true;
43442 }
43443 return false;
43444 };
43445 // TODO: Can we drop the oneuse check for constant extracts?
43446 if (all_of(InputVector->uses(), IsBoolExtract) &&
43447 (IsVar || BoolExtracts.size() > 1)) {
43448 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
43449 if (SDValue BC =
43450 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
43451 for (SDNode *Use : BoolExtracts) {
43452 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
43453 // Mask = 1 << MaskIdx
43454 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
43455 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
43456 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
43457 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
43458 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
43459 DCI.CombineTo(Use, Res);
43460 }
43461 return SDValue(N, 0);
43462 }
43463 }
43464 }
43465
43466 // If this extract is from a loaded vector value and will be used as an
43467 // integer, that requires a potentially expensive XMM -> GPR transfer.
43468 // Additionally, if we can convert to a scalar integer load, that will likely
43469 // be folded into a subsequent integer op.
43470 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
43471 // to a single-use of the loaded vector. For the reasons above, we
43472 // expect this to be profitable even if it creates an extra load.
43473 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
43474 return Use->getOpcode() == ISD::STORE ||
43475 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
43476 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
43477 });
43478 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
43479 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
43480 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
43481 !LikelyUsedAsVector) {
43482 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43483 SDValue NewPtr =
43484 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
43485 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
43486 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
43487 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
43488 SDValue Load =
43489 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
43490 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
43491 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
43492 return Load;
43493 }
43494
43495 return SDValue();
43496}
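// A minimal standalone sketch (not LLVM code) of the scalar idiom produced
// for the vXi1 extracts above once combineBitcastvxi1 has packed the lane
// sign bits into an integer (MOVMSK-style). 'Bits' and 'MaskIdx' are
// hypothetical names standing in for the BC value and the extract index.
static bool extractBoolFromMovmskSketch(unsigned Bits, unsigned MaskIdx) {
  unsigned Mask = 1u << MaskIdx;    // Mask = 1 << MaskIdx
  return (Bits & Mask) == Mask;     // ((movmsk X) & Mask) == Mask
}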
43497
43498// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
43499// This is more or less the reverse of combineBitcastvxi1.
43500static SDValue combineToExtendBoolVectorInReg(
43501 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
43502 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
43503 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
43504 Opcode != ISD::ANY_EXTEND)
43505 return SDValue();
43506 if (!DCI.isBeforeLegalizeOps())
43507 return SDValue();
43508 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
43509 return SDValue();
43510
43511 EVT SVT = VT.getScalarType();
43512 EVT InSVT = N0.getValueType().getScalarType();
43513 unsigned EltSizeInBits = SVT.getSizeInBits();
43514
43515 // Input type must be extending a bool vector (bit-casted from a scalar
43516 // integer) to legal integer types.
43517 if (!VT.isVector())
43518 return SDValue();
43519 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
43520 return SDValue();
43521 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
43522 return SDValue();
43523
43524 SDValue N00 = N0.getOperand(0);
43525 EVT SclVT = N00.getValueType();
43526 if (!SclVT.isScalarInteger())
43527 return SDValue();
43528
43529 SDValue Vec;
43530 SmallVector<int> ShuffleMask;
43531 unsigned NumElts = VT.getVectorNumElements();
43532   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
43533
43534 // Broadcast the scalar integer to the vector elements.
43535 if (NumElts > EltSizeInBits) {
43536 // If the scalar integer is greater than the vector element size, then we
43537 // must split it down into sub-sections for broadcasting. For example:
43538 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
43539 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
43540     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
43541 unsigned Scale = NumElts / EltSizeInBits;
43542 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
43543 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
43544 Vec = DAG.getBitcast(VT, Vec);
43545
43546 for (unsigned i = 0; i != Scale; ++i)
43547 ShuffleMask.append(EltSizeInBits, i);
43548 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
43549 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
43550 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
43551 // If we have register broadcast instructions, use the scalar size as the
43552 // element type for the shuffle. Then cast to the wider element type. The
43553 // widened bits won't be used, and this might allow the use of a broadcast
43554 // load.
43555     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
43556 unsigned Scale = EltSizeInBits / NumElts;
43557 EVT BroadcastVT =
43558 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
43559 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
43560 ShuffleMask.append(NumElts * Scale, 0);
43561 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
43562 Vec = DAG.getBitcast(VT, Vec);
43563 } else {
43564 // For a smaller scalar integer, we can simply any-extend it to the vector
43565 // element size (we don't care about the upper bits) and broadcast it to all
43566 // elements.
43567 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
43568 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
43569 ShuffleMask.append(NumElts, 0);
43570 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
43571 }
43572
43573 // Now, mask the relevant bit in each element.
43574 SmallVector<SDValue, 32> Bits;
43575 for (unsigned i = 0; i != NumElts; ++i) {
43576 int BitIdx = (i % EltSizeInBits);
43577 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
43578 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
43579 }
43580 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
43581 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
43582
43583 // Compare against the bitmask and extend the result.
43584 EVT CCVT = VT.changeVectorElementType(MVT::i1);
43585 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
43586 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
43587
43588 // For SEXT, this is now done, otherwise shift the result down for
43589 // zero-extension.
43590 if (Opcode == ISD::SIGN_EXTEND)
43591 return Vec;
43592 return DAG.getNode(ISD::SRL, DL, VT, Vec,
43593 DAG.getConstant(EltSizeInBits - 1, DL, VT));
43594}
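// A minimal standalone sketch (not LLVM code) of the per-lane logic of the
// broadcast + mask + compare expansion above, written for an 8-bit element
// type. 'EltAfterBroadcast' stands for the lane value after the shuffle stage
// and 'LaneIdx' for the lane position i; both names are illustrative only.
static unsigned char extendBoolLaneSketch(unsigned char EltAfterBroadcast,
                                          unsigned LaneIdx, bool IsZeroExtend) {
  unsigned char Bit = (unsigned char)(1u << (LaneIdx % 8));            // BitMask element
  unsigned char Ext = ((EltAfterBroadcast & Bit) == Bit) ? 0xFF : 0x00; // setcc + sext
  return IsZeroExtend ? (unsigned char)(Ext >> 7) : Ext;               // srl for zero-extension
}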
43595
43596/// If a vector select has an operand that is -1 or 0, try to simplify the
43597/// select to a bitwise logic operation.
43598/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
43599static SDValue
43600combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
43601 TargetLowering::DAGCombinerInfo &DCI,
43602 const X86Subtarget &Subtarget) {
43603 SDValue Cond = N->getOperand(0);
43604 SDValue LHS = N->getOperand(1);
43605 SDValue RHS = N->getOperand(2);
43606 EVT VT = LHS.getValueType();
43607 EVT CondVT = Cond.getValueType();
43608 SDLoc DL(N);
43609 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43610
43611 if (N->getOpcode() != ISD::VSELECT)
43612 return SDValue();
43613
43614   assert(CondVT.isVector() && "Vector select expects a vector selector!");
43615
43616 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
43617 // TODO: Can we assert that both operands are not zeros (because that should
43618 // get simplified at node creation time)?
43619 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
43620 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
43621
43622 // If both inputs are 0/undef, create a complete zero vector.
43623 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
43624 if (TValIsAllZeros && FValIsAllZeros) {
43625 if (VT.isFloatingPoint())
43626 return DAG.getConstantFP(0.0, DL, VT);
43627 return DAG.getConstant(0, DL, VT);
43628 }
43629
43630 // To use the condition operand as a bitwise mask, it must have elements that
43631 // are the same size as the select elements. I.e., the condition operand must
43632 // have already been promoted from the IR select condition type <N x i1>.
43633 // Don't check if the types themselves are equal because that excludes
43634 // vector floating-point selects.
43635 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
43636 return SDValue();
43637
43638 // Try to invert the condition if true value is not all 1s and false value is
43639 // not all 0s. Only do this if the condition has one use.
43640 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
43641 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
43642 // Check if the selector will be produced by CMPP*/PCMP*.
43643 Cond.getOpcode() == ISD::SETCC &&
43644 // Check if SETCC has already been promoted.
43645 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
43646 CondVT) {
43647 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
43648
43649 if (TValIsAllZeros || FValIsAllOnes) {
43650 SDValue CC = Cond.getOperand(2);
43651 ISD::CondCode NewCC = ISD::getSetCCInverse(
43652 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
43653 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
43654 NewCC);
43655 std::swap(LHS, RHS);
43656 TValIsAllOnes = FValIsAllOnes;
43657 FValIsAllZeros = TValIsAllZeros;
43658 }
43659 }
43660
43661 // Cond value must be 'sign splat' to be converted to a logical op.
43662 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
43663 return SDValue();
43664
43665 // vselect Cond, 111..., 000... -> Cond
43666 if (TValIsAllOnes && FValIsAllZeros)
43667 return DAG.getBitcast(VT, Cond);
43668
43669 if (!TLI.isTypeLegal(CondVT))
43670 return SDValue();
43671
43672 // vselect Cond, 111..., X -> or Cond, X
43673 if (TValIsAllOnes) {
43674 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43675 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
43676 return DAG.getBitcast(VT, Or);
43677 }
43678
43679 // vselect Cond, X, 000... -> and Cond, X
43680 if (FValIsAllZeros) {
43681 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
43682 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
43683 return DAG.getBitcast(VT, And);
43684 }
43685
43686 // vselect Cond, 000..., X -> andn Cond, X
43687 if (TValIsAllZeros) {
43688 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43689 SDValue AndN;
43690 // The canonical form differs for i1 vectors - x86andnp is not used
43691 if (CondVT.getScalarType() == MVT::i1)
43692 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
43693 CastRHS);
43694 else
43695 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
43696 return DAG.getBitcast(VT, AndN);
43697 }
43698
43699 return SDValue();
43700}
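// A minimal standalone sketch (not LLVM code) of the bitwise identities
// behind the OR/AND/ANDN folds above; they hold because Cond was verified to
// be a "sign splat" (every lane is all-ones or all-zero).
static unsigned vselectLaneSketch(unsigned Cond, unsigned T, unsigned F) {
  // Generic lane select with an all-ones/zero mask:
  //   vselect Cond, T, F  ==  (Cond & T) | (~Cond & F)
  // T == ~0u reduces this to (Cond | F); F == 0 reduces it to (Cond & T);
  // T == 0 reduces it to (~Cond & F), i.e. the ANDNP form.
  return (Cond & T) | (~Cond & F);
}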
43701
43702/// If both arms of a vector select are concatenated vectors, split the select,
43703/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
43704/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
43705/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
43706static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
43707 const X86Subtarget &Subtarget) {
43708 unsigned Opcode = N->getOpcode();
43709 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
43710 return SDValue();
43711
43712 // TODO: Split 512-bit vectors too?
43713 EVT VT = N->getValueType(0);
43714 if (!VT.is256BitVector())
43715 return SDValue();
43716
43717 // TODO: Split as long as any 2 of the 3 operands are concatenated?
43718 SDValue Cond = N->getOperand(0);
43719 SDValue TVal = N->getOperand(1);
43720 SDValue FVal = N->getOperand(2);
43721 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
43722 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
43723 !collectConcatOps(TVal.getNode(), CatOpsT) ||
43724 !collectConcatOps(FVal.getNode(), CatOpsF))
43725 return SDValue();
43726
43727 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
43728 ArrayRef<SDValue> Ops) {
43729 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
43730 };
43731 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
43732 makeBlend, /*CheckBWI*/ false);
43733}
43734
43735static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
43736 SDValue Cond = N->getOperand(0);
43737 SDValue LHS = N->getOperand(1);
43738 SDValue RHS = N->getOperand(2);
43739 SDLoc DL(N);
43740
43741 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
43742 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
43743 if (!TrueC || !FalseC)
43744 return SDValue();
43745
43746 // Don't do this for crazy integer types.
43747 EVT VT = N->getValueType(0);
43748 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
43749 return SDValue();
43750
43751 // We're going to use the condition bit in math or logic ops. We could allow
43752 // this with a wider condition value (post-legalization it becomes an i8),
43753 // but if nothing is creating selects that late, it doesn't matter.
43754 if (Cond.getValueType() != MVT::i1)
43755 return SDValue();
43756
43757 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
43758 // 3, 5, or 9 with i32/i64, so those get transformed too.
43759 // TODO: For constants that overflow or do not differ by power-of-2 or small
43760 // multiplier, convert to 'and' + 'add'.
43761 const APInt &TrueVal = TrueC->getAPIntValue();
43762 const APInt &FalseVal = FalseC->getAPIntValue();
43763
43764 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
43765 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
43766 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
43767 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
43768 if (CC == ISD::SETEQ || CC == ISD::SETNE)
43769 return SDValue();
43770 }
43771
43772 bool OV;
43773 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
43774 if (OV)
43775 return SDValue();
43776
43777 APInt AbsDiff = Diff.abs();
43778 if (AbsDiff.isPowerOf2() ||
43779 ((VT == MVT::i32 || VT == MVT::i64) &&
43780 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
43781
43782 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
43783 // of the condition can usually be folded into a compare predicate, but even
43784 // without that, the sequence should be cheaper than a CMOV alternative.
43785 if (TrueVal.slt(FalseVal)) {
43786 Cond = DAG.getNOT(DL, Cond, MVT::i1);
43787 std::swap(TrueC, FalseC);
43788 }
43789
43790 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
43791 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
43792
43793 // Multiply condition by the difference if non-one.
43794 if (!AbsDiff.isOne())
43795 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
43796
43797 // Add the base if non-zero.
43798 if (!FalseC->isZero())
43799 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
43800
43801 return R;
43802 }
43803
43804 return SDValue();
43805}
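// A minimal standalone sketch (not LLVM code) of the scalar identity used
// above, assuming Cond is 0 or 1; the code additionally flips Cond via getNOT
// when TrueVal < FalseVal so that the multiplier is non-negative.
static long long selectOfTwoConstantsSketch(bool Cond, long long TC, long long FC) {
  // select Cond, TC, FC  -->  (zext(Cond) * (TC - FC)) + FC
  return (long long)Cond * (TC - FC) + FC;
}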
43806
43807/// If this is a *dynamic* select (non-constant condition) and we can match
43808/// this node with one of the variable blend instructions, restructure the
43809/// condition so that blends can use the high (sign) bit of each element.
43810/// This function will also call SimplifyDemandedBits on already created
43811/// BLENDV to perform additional simplifications.
43812static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
43813 TargetLowering::DAGCombinerInfo &DCI,
43814 const X86Subtarget &Subtarget) {
43815 SDValue Cond = N->getOperand(0);
43816 if ((N->getOpcode() != ISD::VSELECT &&
43817 N->getOpcode() != X86ISD::BLENDV) ||
43818 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
43819 return SDValue();
43820
43821 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43822 unsigned BitWidth = Cond.getScalarValueSizeInBits();
43823 EVT VT = N->getValueType(0);
43824
43825 // We can only handle the cases where VSELECT is directly legal on the
43826 // subtarget. We custom lower VSELECT nodes with constant conditions and
43827 // this makes it hard to see whether a dynamic VSELECT will correctly
43828 // lower, so we both check the operation's status and explicitly handle the
43829 // cases where a *dynamic* blend will fail even though a constant-condition
43830 // blend could be custom lowered.
43831 // FIXME: We should find a better way to handle this class of problems.
43832 // Potentially, we should combine constant-condition vselect nodes
43833 // pre-legalization into shuffles and not mark as many types as custom
43834 // lowered.
43835 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
43836 return SDValue();
43837 // FIXME: We don't support i16-element blends currently. We could and
43838 // should support them by making *all* the bits in the condition be set
43839 // rather than just the high bit and using an i8-element blend.
43840 if (VT.getVectorElementType() == MVT::i16)
43841 return SDValue();
43842 // Dynamic blending was only available from SSE4.1 onward.
43843 if (VT.is128BitVector() && !Subtarget.hasSSE41())
43844 return SDValue();
43845 // Byte blends are only available in AVX2
43846 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
43847 return SDValue();
43848 // There are no 512-bit blend instructions that use sign bits.
43849 if (VT.is512BitVector())
43850 return SDValue();
43851
43852 // Don't optimize before the condition has been transformed to a legal type
43853 // and don't ever optimize vector selects that map to AVX512 mask-registers.
43854 if (BitWidth < 8 || BitWidth > 64)
43855 return SDValue();
43856
43857 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
43858 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
43859 UI != UE; ++UI)
43860 if ((UI->getOpcode() != ISD::VSELECT &&
43861 UI->getOpcode() != X86ISD::BLENDV) ||
43862 UI.getOperandNo() != 0)
43863 return false;
43864
43865 return true;
43866 };
43867
43868 APInt DemandedBits(APInt::getSignMask(BitWidth));
43869
43870 if (OnlyUsedAsSelectCond(Cond)) {
43871 KnownBits Known;
43872 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
43873 !DCI.isBeforeLegalizeOps());
43874 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
43875 return SDValue();
43876
43877 // If we changed the computation somewhere in the DAG, this change will
43878 // affect all users of Cond. Update all the nodes so that we do not use
43879 // the generic VSELECT anymore. Otherwise, we may perform wrong
43880 // optimizations as we messed with the actual expectation for the vector
43881 // boolean values.
43882 for (SDNode *U : Cond->uses()) {
43883 if (U->getOpcode() == X86ISD::BLENDV)
43884 continue;
43885
43886 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
43887 Cond, U->getOperand(1), U->getOperand(2));
43888 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
43889 DCI.AddToWorklist(U);
43890 }
43891 DCI.CommitTargetLoweringOpt(TLO);
43892 return SDValue(N, 0);
43893 }
43894
43895 // Otherwise we can still at least try to simplify multiple use bits.
43896 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
43897 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
43898 N->getOperand(1), N->getOperand(2));
43899
43900 return SDValue();
43901}
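// A minimal standalone sketch (not LLVM code) of the per-element behaviour
// assumed here for X86ISD::BLENDV (PBLENDVB-style): only the sign bit of each
// mask element picks between the two value operands, which is why the combine
// above runs SimplifyDemandedBits with just the sign bit demanded.
static unsigned char blendvLaneSketch(unsigned char MaskByte, unsigned char T,
                                      unsigned char F) {
  return (MaskByte & 0x80) ? T : F; // sign bit set -> take the "true" operand
}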
43902
43903// Try to match:
43904 // (or (and M, (sub 0, X)), (pandn M, X))
43905// which is a special case of:
43906// (select M, (sub 0, X), X)
43907// Per:
43908// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
43909// We know that, if fNegate is 0 or 1:
43910// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
43911//
43912// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
43913// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
43914// ( M ? -X : X) == ((X ^ M ) + (M & 1))
43915// This lets us transform our vselect to:
43916// (add (xor X, M), (and M, 1))
43917// And further to:
43918// (sub (xor X, M), M)
43919static SDValue combineLogicBlendIntoConditionalNegate(
43920 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
43921 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
43922 EVT MaskVT = Mask.getValueType();
43923   assert(MaskVT.isInteger() &&
43924          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
43925          "Mask must be zero/all-bits");
43926
43927 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
43928 return SDValue();
43929 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
43930 return SDValue();
43931
43932 auto IsNegV = [](SDNode *N, SDValue V) {
43933 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
43934 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
43935 };
43936
43937 SDValue V;
43938 if (IsNegV(Y.getNode(), X))
43939 V = X;
43940 else if (IsNegV(X.getNode(), Y))
43941 V = Y;
43942 else
43943 return SDValue();
43944
43945 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
43946 SDValue SubOp2 = Mask;
43947
43948 // If the negate was on the false side of the select, then
43949 // the operands of the SUB need to be swapped. PR 27251.
43950 // This is because the pattern being matched above is
43951 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
43952 // but if the pattern matched was
43953 // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
43954 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
43955 // pattern also needs to be a negation of the replacement pattern above.
43956 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
43957 // sub accomplishes the negation of the replacement pattern.
43958 if (V == Y)
43959 std::swap(SubOp1, SubOp2);
43960
43961 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
43962 return DAG.getBitcast(VT, Res);
43963}
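// A minimal standalone sketch (not LLVM code) of the scalar form of the
// conditional-negate identity used above, assuming M is either 0 or all-ones
// as the mask assertion requires.
static int conditionalNegateSketch(int X, int M) {
  // (M ? -X : X) == (X ^ M) - M   when M is 0 or -1.
  return (X ^ M) - M;
}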
43964
43965/// Do target-specific dag combines on SELECT and VSELECT nodes.
43966static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
43967 TargetLowering::DAGCombinerInfo &DCI,
43968 const X86Subtarget &Subtarget) {
43969 SDLoc DL(N);
43970 SDValue Cond = N->getOperand(0);
43971 SDValue LHS = N->getOperand(1);
43972 SDValue RHS = N->getOperand(2);
43973
43974 // Try simplification again because we use this function to optimize
43975 // BLENDV nodes that are not handled by the generic combiner.
43976 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
43977 return V;
43978
43979 EVT VT = LHS.getValueType();
43980 EVT CondVT = Cond.getValueType();
43981 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43982 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
43983
43984 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
43985 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
43986 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
43987 if (CondVT.isVector() && CondVT.isInteger() &&
43988 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
43989 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
43990 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
43991 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
43992 DL, DAG, Subtarget))
43993 return V;
43994
43995 // Convert vselects with constant condition into shuffles.
43996 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
43997 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
43998 SmallVector<int, 64> Mask;
43999 if (createShuffleMaskFromVSELECT(Mask, Cond,
44000 N->getOpcode() == X86ISD::BLENDV))
44001 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
44002 }
44003
44004 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
44005 // by forcing the unselected elements to zero.
44006 // TODO: Can we handle more shuffles with this?
44007 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
44008 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
44009 LHS.hasOneUse() && RHS.hasOneUse()) {
44010 MVT SimpleVT = VT.getSimpleVT();
44011 SmallVector<SDValue, 1> LHSOps, RHSOps;
44012 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
44013 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
44014 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
44015 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
44016 int NumElts = VT.getVectorNumElements();
44017 for (int i = 0; i != NumElts; ++i) {
44018 // getConstVector sets negative shuffle mask values as undef, so ensure
44019 // we hardcode SM_SentinelZero values to zero (0x80).
44020 if (CondMask[i] < NumElts) {
44021 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
44022 RHSMask[i] = 0x80;
44023 } else {
44024 LHSMask[i] = 0x80;
44025 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
44026 }
44027 }
44028 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
44029 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
44030 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
44031 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
44032 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
44033 }
44034 }
44035
44036 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
44037 // instructions match the semantics of the common C idiom x<y?x:y but not
44038 // x<=y?x:y, because of how they handle negative zero (which can be
44039 // ignored in unsafe-math mode).
44040 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
44041 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
44042 VT != MVT::f80 && VT != MVT::f128 &&
44043 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
44044 (Subtarget.hasSSE2() ||
44045 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
44046 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44047
44048 unsigned Opcode = 0;
44049 // Check for x CC y ? x : y.
44050 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
44051 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
44052 switch (CC) {
44053 default: break;
44054 case ISD::SETULT:
44055 // Converting this to a min would handle NaNs incorrectly, and swapping
44056 // the operands would cause it to handle comparisons between positive
44057 // and negative zero incorrectly.
44058 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44059 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44060 !(DAG.isKnownNeverZeroFloat(LHS) ||
44061 DAG.isKnownNeverZeroFloat(RHS)))
44062 break;
44063 std::swap(LHS, RHS);
44064 }
44065 Opcode = X86ISD::FMIN;
44066 break;
44067 case ISD::SETOLE:
44068 // Converting this to a min would handle comparisons between positive
44069 // and negative zero incorrectly.
44070 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44071 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44072 break;
44073 Opcode = X86ISD::FMIN;
44074 break;
44075 case ISD::SETULE:
44076 // Converting this to a min would handle both negative zeros and NaNs
44077 // incorrectly, but we can swap the operands to fix both.
44078 std::swap(LHS, RHS);
44079       LLVM_FALLTHROUGH;
44080 case ISD::SETOLT:
44081 case ISD::SETLT:
44082 case ISD::SETLE:
44083 Opcode = X86ISD::FMIN;
44084 break;
44085
44086 case ISD::SETOGE:
44087 // Converting this to a max would handle comparisons between positive
44088 // and negative zero incorrectly.
44089 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44090 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44091 break;
44092 Opcode = X86ISD::FMAX;
44093 break;
44094 case ISD::SETUGT:
44095 // Converting this to a max would handle NaNs incorrectly, and swapping
44096 // the operands would cause it to handle comparisons between positive
44097 // and negative zero incorrectly.
44098 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44099 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44100 !(DAG.isKnownNeverZeroFloat(LHS) ||
44101 DAG.isKnownNeverZeroFloat(RHS)))
44102 break;
44103 std::swap(LHS, RHS);
44104 }
44105 Opcode = X86ISD::FMAX;
44106 break;
44107 case ISD::SETUGE:
44108 // Converting this to a max would handle both negative zeros and NaNs
44109 // incorrectly, but we can swap the operands to fix both.
44110 std::swap(LHS, RHS);
44111       LLVM_FALLTHROUGH;
44112 case ISD::SETOGT:
44113 case ISD::SETGT:
44114 case ISD::SETGE:
44115 Opcode = X86ISD::FMAX;
44116 break;
44117 }
44118 // Check for x CC y ? y : x -- a min/max with reversed arms.
44119 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
44120 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
44121 switch (CC) {
44122 default: break;
44123 case ISD::SETOGE:
44124 // Converting this to a min would handle comparisons between positive
44125 // and negative zero incorrectly, and swapping the operands would
44126 // cause it to handle NaNs incorrectly.
44127 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44128 !(DAG.isKnownNeverZeroFloat(LHS) ||
44129 DAG.isKnownNeverZeroFloat(RHS))) {
44130 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44131 break;
44132 std::swap(LHS, RHS);
44133 }
44134 Opcode = X86ISD::FMIN;
44135 break;
44136 case ISD::SETUGT:
44137 // Converting this to a min would handle NaNs incorrectly.
44138 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44139 break;
44140 Opcode = X86ISD::FMIN;
44141 break;
44142 case ISD::SETUGE:
44143 // Converting this to a min would handle both negative zeros and NaNs
44144 // incorrectly, but we can swap the operands to fix both.
44145 std::swap(LHS, RHS);
44146       LLVM_FALLTHROUGH;
44147 case ISD::SETOGT:
44148 case ISD::SETGT:
44149 case ISD::SETGE:
44150 Opcode = X86ISD::FMIN;
44151 break;
44152
44153 case ISD::SETULT:
44154 // Converting this to a max would handle NaNs incorrectly.
44155 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44156 break;
44157 Opcode = X86ISD::FMAX;
44158 break;
44159 case ISD::SETOLE:
44160 // Converting this to a max would handle comparisons between positive
44161 // and negative zero incorrectly, and swapping the operands would
44162 // cause it to handle NaNs incorrectly.
44163 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44164 !DAG.isKnownNeverZeroFloat(LHS) &&
44165 !DAG.isKnownNeverZeroFloat(RHS)) {
44166 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44167 break;
44168 std::swap(LHS, RHS);
44169 }
44170 Opcode = X86ISD::FMAX;
44171 break;
44172 case ISD::SETULE:
44173 // Converting this to a max would handle both negative zeros and NaNs
44174 // incorrectly, but we can swap the operands to fix both.
44175 std::swap(LHS, RHS);
44176       LLVM_FALLTHROUGH;
44177 case ISD::SETOLT:
44178 case ISD::SETLT:
44179 case ISD::SETLE:
44180 Opcode = X86ISD::FMAX;
44181 break;
44182 }
44183 }
44184
44185 if (Opcode)
44186 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
44187 }
44188
44189 // Some mask scalar intrinsics rely on checking if only one bit is set
44190 // and implement it in C code like this:
44191 // A[0] = (U & 1) ? A[0] : W[0];
44192 // This creates some redundant instructions that break pattern matching.
44193 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
44194 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
44195 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
44196 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44197 SDValue AndNode = Cond.getOperand(0);
44198 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
44199 isNullConstant(Cond.getOperand(1)) &&
44200 isOneConstant(AndNode.getOperand(1))) {
44201 // LHS and RHS swapped due to
44202 // setcc outputting 1 when AND resulted in 0 and vice versa.
44203 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
44204 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
44205 }
44206 }
44207
44208 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
44209 // lowering on KNL. In this case we convert it to
44210 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
44211 // The same applies to all vectors of i8 and i16 without BWI.
44212 // Make sure we extend these even before type legalization gets a chance to
44213 // split wide vectors.
44214 // Since SKX these selects have a proper lowering.
44215 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
44216 CondVT.getVectorElementType() == MVT::i1 &&
44217 (VT.getVectorElementType() == MVT::i8 ||
44218 VT.getVectorElementType() == MVT::i16)) {
44219 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
44220 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
44221 }
44222
44223 // AVX512 - Extend select with zero to merge with target shuffle.
44224 // select(mask, extract_subvector(shuffle(x)), zero) -->
44225 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
44226 // TODO - support non target shuffles as well.
44227 if (Subtarget.hasAVX512() && CondVT.isVector() &&
44228 CondVT.getVectorElementType() == MVT::i1) {
44229 auto SelectableOp = [&TLI](SDValue Op) {
44230 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44231 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
44232 isNullConstant(Op.getOperand(1)) &&
44233 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
44234 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
44235 };
44236
44237 bool SelectableLHS = SelectableOp(LHS);
44238 bool SelectableRHS = SelectableOp(RHS);
44239 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
44240 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
44241
44242 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
44243 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
44244 : RHS.getOperand(0).getValueType();
44245 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
44246 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
44247 VT.getSizeInBits());
44248 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
44249 VT.getSizeInBits());
44250 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
44251 DAG.getUNDEF(SrcCondVT), Cond,
44252 DAG.getIntPtrConstant(0, DL));
44253 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
44254 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
44255 }
44256 }
44257
44258 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
44259 return V;
44260
44261 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
44262 Cond.hasOneUse()) {
44263 EVT CondVT = Cond.getValueType();
44264 SDValue Cond0 = Cond.getOperand(0);
44265 SDValue Cond1 = Cond.getOperand(1);
44266 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44267
44268 // Canonicalize min/max:
44269 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
44270 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
44271 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
44272 // the need for an extra compare against zero. e.g.
44273 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
44274 // subl %esi, %edi
44275 // testl %edi, %edi
44276 // movl $0, %eax
44277 // cmovgl %edi, %eax
44278 // =>
44279 // xorl %eax, %eax
44280 // subl %esi, %edi
44281 // cmovsl %eax, %edi
44282 //
44283 // We can also canonicalize
44284 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
44285 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
44286 // This allows the use of a test instruction for the compare.
44287 if (LHS == Cond0 && RHS == Cond1) {
44288 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
44289 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
44290 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
44291 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44292 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44293 }
44294 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
44295 ISD::CondCode NewCC = ISD::SETUGE;
44296 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44297 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44298 }
44299 }
44300
44301 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
44302 // fold eq + gt/lt nested selects into ge/le selects
44303 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
44304 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
44305 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
44306 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
44307 // .. etc ..
44308 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
44309 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
44310 SDValue InnerSetCC = RHS.getOperand(0);
44311 ISD::CondCode InnerCC =
44312 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
44313 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
44314 Cond0 == InnerSetCC.getOperand(0) &&
44315 Cond1 == InnerSetCC.getOperand(1)) {
44316 ISD::CondCode NewCC;
44317 switch (CC == ISD::SETEQ ? InnerCC : CC) {
44318 case ISD::SETGT: NewCC = ISD::SETGE; break;
44319 case ISD::SETLT: NewCC = ISD::SETLE; break;
44320 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
44321 case ISD::SETULT: NewCC = ISD::SETULE; break;
44322 default: NewCC = ISD::SETCC_INVALID; break;
44323 }
44324 if (NewCC != ISD::SETCC_INVALID) {
44325 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
44326 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
44327 }
44328 }
44329 }
44330 }
44331
44332 // Check if the first operand is all zeros and Cond type is vXi1.
44334 // If this is an AVX512 target, we can improve the use of zero masking by
44334 // swapping the operands and inverting the condition.
44335 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
44336 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
44337 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
44338 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
44339 // Invert the cond to not(cond) : xor(op,allones)=not(op)
44340 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
44341 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
44342 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
44343 }
44344
44345 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
44346 // get split by legalization.
44347 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
44348 CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
44349 TLI.isTypeLegal(VT.getScalarType())) {
44350 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
44351 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
44352 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
44353 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
44354 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
44355 }
44356 }
44357
44358 // Early exit check
44359 if (!TLI.isTypeLegal(VT))
44360 return SDValue();
44361
44362 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
44363 return V;
44364
44365 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
44366 return V;
44367
44368 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
44369 return V;
44370
44371 // select(~Cond, X, Y) -> select(Cond, Y, X)
44372 if (CondVT.getScalarType() != MVT::i1) {
44373 if (SDValue CondNot = IsNOT(Cond, DAG))
44374 return DAG.getNode(N->getOpcode(), DL, VT,
44375 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
44376 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
44377 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
44378 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
44379 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
44380 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
44381 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
44382 }
44383 }
44384
44385 // Try to optimize vXi1 selects if both operands are either all constants or
44386 // bitcasts from scalar integer type. In that case we can convert the operands
44387 // to integer and use an integer select which will be converted to a CMOV.
44388 // We need to take a little bit of care to avoid creating an i64 type after
44389 // type legalization.
44390 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
44391 VT.getVectorElementType() == MVT::i1 &&
44392 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
44393 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
44394 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
44395 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
44396
44397 if ((LHSIsConst ||
44398 (LHS.getOpcode() == ISD::BITCAST &&
44399 LHS.getOperand(0).getValueType() == IntVT)) &&
44400 (RHSIsConst ||
44401 (RHS.getOpcode() == ISD::BITCAST &&
44402 RHS.getOperand(0).getValueType() == IntVT))) {
44403 if (LHSIsConst)
44404 LHS = combinevXi1ConstantToInteger(LHS, DAG);
44405 else
44406 LHS = LHS.getOperand(0);
44407
44408 if (RHSIsConst)
44409 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44410 else
44411 RHS = RHS.getOperand(0);
44412
44413 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
44414 return DAG.getBitcast(VT, Select);
44415 }
44416 }
44417
44418 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
44419 // single bits, then invert the predicate and swap the select operands.
44420 // This can lower using a vector shift bit-hack rather than mask and compare.
44421 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
44422 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
44423 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
44424 Cond.getOperand(0).getOpcode() == ISD::AND &&
44425 isNullOrNullSplat(Cond.getOperand(1)) &&
44426 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
44427 Cond.getOperand(0).getValueType() == VT) {
44428 // The 'and' mask must be composed of power-of-2 constants.
44429 SDValue And = Cond.getOperand(0);
44430 auto *C = isConstOrConstSplat(And.getOperand(1));
44431 if (C && C->getAPIntValue().isPowerOf2()) {
44432 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
44433 SDValue NotCond =
44434 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
44435 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
44436 }
44437
44438 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
44439 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
44440 // 16-bit lacks a proper blendv.
44441 unsigned EltBitWidth = VT.getScalarSizeInBits();
44442 bool CanShiftBlend =
44443 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
44444 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
44445 (Subtarget.hasXOP()));
44446 if (CanShiftBlend &&
44447 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
44448 return C->getAPIntValue().isPowerOf2();
44449 })) {
44450 // Create a left-shift constant to get the mask bits over to the sign-bit.
44451 SDValue Mask = And.getOperand(1);
44452 SmallVector<int, 32> ShlVals;
44453 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
44454 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
44455 ShlVals.push_back(EltBitWidth - 1 -
44456 MaskVal->getAPIntValue().exactLogBase2());
44457 }
44458 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
44459 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
44460 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
44461 SDValue NewCond =
44462 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
44463 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
44464 }
44465 }
44466
44467 return SDValue();
44468}
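// A minimal standalone sketch (not LLVM code) of the scalar semantics assumed
// for X86ISD::FMIN in the SSE min/max matching inside combineSelect above
// (mirroring MINPS/MINSS): the second operand is returned whenever the
// ordered less-than test fails, including for NaNs and for +0.0 vs -0.0,
// which is why the code is so careful about operand order and swapping.
static float x86FminSketch(float A, float B) {
  return A < B ? A : B; // returns B for unordered (NaN) inputs and for (+0.0, -0.0)
}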
44469
44470/// Combine:
44471/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
44472/// to:
44473/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
44474/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
44475/// Note that this is only legal for some op/cc combinations.
44476static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
44477 SelectionDAG &DAG,
44478 const X86Subtarget &Subtarget) {
44479 // This combine only operates on CMP-like nodes.
44480 if (!(Cmp.getOpcode() == X86ISD::CMP ||
44481 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
44482 return SDValue();
44483
44484 // Can't replace the cmp if it has more uses than the one we're looking at.
44485 // FIXME: We would like to be able to handle this, but would need to make sure
44486 // all uses were updated.
44487 if (!Cmp.hasOneUse())
44488 return SDValue();
44489
44490 // This only applies to variations of the common case:
44491 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
44492 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
44493 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
44494 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
44495 // Using the proper condcodes (see below), overflow is checked for.
44496
44497 // FIXME: We can generalize both constraints:
44498 // - XOR/OR/AND (if they were made to survive AtomicExpand)
44499 // - LHS != 1
44500 // if the result is compared.
44501
44502 SDValue CmpLHS = Cmp.getOperand(0);
44503 SDValue CmpRHS = Cmp.getOperand(1);
44504 EVT CmpVT = CmpLHS.getValueType();
44505
44506 if (!CmpLHS.hasOneUse())
44507 return SDValue();
44508
44509 unsigned Opc = CmpLHS.getOpcode();
44510 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
44511 return SDValue();
44512
44513 SDValue OpRHS = CmpLHS.getOperand(2);
44514 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
44515 if (!OpRHSC)
44516 return SDValue();
44517
44518 APInt Addend = OpRHSC->getAPIntValue();
44519 if (Opc == ISD::ATOMIC_LOAD_SUB)
44520 Addend = -Addend;
44521
44522 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
44523 if (!CmpRHSC)
44524 return SDValue();
44525
44526 APInt Comparison = CmpRHSC->getAPIntValue();
44527 APInt NegAddend = -Addend;
44528
44529 // See if we can adjust the CC to make the comparison match the negated
44530 // addend.
44531 if (Comparison != NegAddend) {
44532 APInt IncComparison = Comparison + 1;
44533 if (IncComparison == NegAddend) {
44534 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
44535 Comparison = IncComparison;
44536 CC = X86::COND_AE;
44537 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
44538 Comparison = IncComparison;
44539 CC = X86::COND_L;
44540 }
44541 }
44542 APInt DecComparison = Comparison - 1;
44543 if (DecComparison == NegAddend) {
44544 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
44545 Comparison = DecComparison;
44546 CC = X86::COND_A;
44547 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
44548 Comparison = DecComparison;
44549 CC = X86::COND_LE;
44550 }
44551 }
44552 }
44553
44554 // If the addend is the negation of the comparison value, then we can do
44555 // a full comparison by emitting the atomic arithmetic as a locked sub.
44556 if (Comparison == NegAddend) {
44557 // The CC is fine, but we need to rewrite the LHS of the comparison as an
44558 // atomic sub.
44559 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
44560 auto AtomicSub = DAG.getAtomic(
44561 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
44562 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
44563 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
44564 AN->getMemOperand());
44565 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
44566 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44567 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44568 return LockOp;
44569 }
44570
44571 // We can handle comparisons with zero in a number of cases by manipulating
44572 // the CC used.
44573 if (!Comparison.isZero())
44574 return SDValue();
44575
44576 if (CC == X86::COND_S && Addend == 1)
44577 CC = X86::COND_LE;
44578 else if (CC == X86::COND_NS && Addend == 1)
44579 CC = X86::COND_G;
44580 else if (CC == X86::COND_G && Addend == -1)
44581 CC = X86::COND_GE;
44582 else if (CC == X86::COND_LE && Addend == -1)
44583 CC = X86::COND_L;
44584 else
44585 return SDValue();
44586
44587 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
44588 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44589 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44590 return LockOp;
44591}
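// A minimal standalone sketch (not LLVM code) of the unsigned half of the
// strict/non-strict rewrites used above when nudging Comparison toward
// -Addend; the signed COND_L/COND_LE cases follow the same pattern with
// signed bounds.
static bool isAboveSketch(unsigned A, unsigned C)        { return A > C;  } // COND_A
static bool isAboveOrEqualSketch(unsigned A, unsigned C) { return A >= C; } // COND_AE
// For C != UINT_MAX:  isAboveSketch(A, C)        == isAboveOrEqualSketch(A, C + 1)
// For C != 0:         isAboveOrEqualSketch(A, C) == isAboveSketch(A, C - 1)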
44592
44593// Check whether a boolean test is testing a boolean value generated by
44594// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
44595// code.
44596//
44597// Simplify the following patterns:
44598// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
44599// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
44600// to (Op EFLAGS Cond)
44601//
44602// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
44603// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
44604// to (Op EFLAGS !Cond)
44605//
44606// where Op could be BRCOND or CMOV.
44607//
44608static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
44609 // This combine only operates on CMP-like nodes.
44610 if (!(Cmp.getOpcode() == X86ISD::CMP ||
44611 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
44612 return SDValue();
44613
44614 // Quit if not used as a boolean value.
44615 if (CC != X86::COND_E && CC != X86::COND_NE)
44616 return SDValue();
44617
44618 // Check CMP operands. One of them should be 0 or 1 and the other should be
44619 // a SetCC or extended from it.
44620 SDValue Op1 = Cmp.getOperand(0);
44621 SDValue Op2 = Cmp.getOperand(1);
44622
44623 SDValue SetCC;
44624 const ConstantSDNode* C = nullptr;
44625 bool needOppositeCond = (CC == X86::COND_E);
44626 bool checkAgainstTrue = false; // Is it a comparison against 1?
44627
44628 if ((C = dyn_cast<ConstantSDNode>(Op1)))
44629 SetCC = Op2;
44630 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
44631 SetCC = Op1;
44632 else // Quit if neither operand is a constant.
44633 return SDValue();
44634
44635 if (C->getZExtValue() == 1) {
44636 needOppositeCond = !needOppositeCond;
44637 checkAgainstTrue = true;
44638 } else if (C->getZExtValue() != 0)
44639 // Quit if the constant is neither 0 nor 1.
44640 return SDValue();
44641
44642 bool truncatedToBoolWithAnd = false;
44643 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
44644 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
44645 SetCC.getOpcode() == ISD::TRUNCATE ||
44646 SetCC.getOpcode() == ISD::AND) {
44647 if (SetCC.getOpcode() == ISD::AND) {
44648 int OpIdx = -1;
44649 if (isOneConstant(SetCC.getOperand(0)))
44650 OpIdx = 1;
44651 if (isOneConstant(SetCC.getOperand(1)))
44652 OpIdx = 0;
44653 if (OpIdx < 0)
44654 break;
44655 SetCC = SetCC.getOperand(OpIdx);
44656 truncatedToBoolWithAnd = true;
44657 } else
44658 SetCC = SetCC.getOperand(0);
44659 }
44660
44661 switch (SetCC.getOpcode()) {
44662 case X86ISD::SETCC_CARRY:
44663 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
44664 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
44665 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
44666 // truncated to i1 using 'and'.
44667 if (checkAgainstTrue && !truncatedToBoolWithAnd)
44668 break;
44669 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
44670 "Invalid use of SETCC_CARRY!");
44671 LLVM_FALLTHROUGH;
44672 case X86ISD::SETCC:
44673 // Set the condition code or opposite one if necessary.
44674 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
44675 if (needOppositeCond)
44676 CC = X86::GetOppositeBranchCondition(CC);
44677 return SetCC.getOperand(1);
44678 case X86ISD::CMOV: {
44679 // Check whether false/true value has canonical one, i.e. 0 or 1.
44680 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
44681 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
44682 // Quit if true value is not a constant.
44683 if (!TVal)
44684 return SDValue();
44685 // Quit if false value is not a constant.
44686 if (!FVal) {
44687 SDValue Op = SetCC.getOperand(0);
44688 // Skip 'zext' or 'trunc' node.
44689 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
44690 Op.getOpcode() == ISD::TRUNCATE)
44691 Op = Op.getOperand(0);
44692 // A special case for rdrand/rdseed, where 0 is set if false cond is
44693 // found.
44694 if ((Op.getOpcode() != X86ISD::RDRAND &&
44695 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
44696 return SDValue();
44697 }
44698 // Quit if false value is not the constant 0 or 1.
44699 bool FValIsFalse = true;
44700 if (FVal && FVal->getZExtValue() != 0) {
44701 if (FVal->getZExtValue() != 1)
44702 return SDValue();
44703 // If FVal is 1, opposite cond is needed.
44704 needOppositeCond = !needOppositeCond;
44705 FValIsFalse = false;
44706 }
44707 // Quit if TVal is not the constant opposite of FVal.
44708 if (FValIsFalse && TVal->getZExtValue() != 1)
44709 return SDValue();
44710 if (!FValIsFalse && TVal->getZExtValue() != 0)
44711 return SDValue();
44712 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
44713 if (needOppositeCond)
44714 CC = X86::GetOppositeBranchCondition(CC);
44715 return SetCC.getOperand(3);
44716 }
44717 }
44718
44719 return SDValue();
44720}
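A hedged source-level sketch of how a (CMP (SETCC ...), 0/1) pattern can reach this helper: a boolean produced by one comparison is compared against 0/1 again before feeding a branch or CMOV. Whether earlier passes already fold it depends on the pipeline; this is illustrative only.

// Illustrative only: the fold above lets the consumer test the EFLAGS of the
// original compare directly, with Cond or !Cond, instead of re-testing the
// materialized boolean.
int selectOnRetest(int X, int Y, int A, int B) {
  bool Flag = (X < Y);            // materializes a SETCC
  return (Flag == true) ? A : B;  // re-tests the boolean against 1
}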
44721
44722/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
44723/// Match:
44724/// (X86or (X86setcc) (X86setcc))
44725/// (X86cmp (and (X86setcc) (X86setcc)), 0)
44726static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
44727 X86::CondCode &CC1, SDValue &Flags,
44728 bool &isAnd) {
44729 if (Cond->getOpcode() == X86ISD::CMP) {
44730 if (!isNullConstant(Cond->getOperand(1)))
44731 return false;
44732
44733 Cond = Cond->getOperand(0);
44734 }
44735
44736 isAnd = false;
44737
44738 SDValue SetCC0, SetCC1;
44739 switch (Cond->getOpcode()) {
44740 default: return false;
44741 case ISD::AND:
44742 case X86ISD::AND:
44743 isAnd = true;
44744 LLVM_FALLTHROUGH;
44745 case ISD::OR:
44746 case X86ISD::OR:
44747 SetCC0 = Cond->getOperand(0);
44748 SetCC1 = Cond->getOperand(1);
44749 break;
44750 };
44751
44752 // Make sure we have SETCC nodes, using the same flags value.
44753 if (SetCC0.getOpcode() != X86ISD::SETCC ||
44754 SetCC1.getOpcode() != X86ISD::SETCC ||
44755 SetCC0->getOperand(1) != SetCC1->getOperand(1))
44756 return false;
44757
44758 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
44759 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
44760 Flags = SetCC0->getOperand(1);
44761 return true;
44762}
44763
44764// When legalizing carry, we create carries via add X, -1
44765// If that comes from an actual carry, via setcc, we use the
44766// carry directly.
44767static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
44768 if (EFLAGS.getOpcode() == X86ISD::ADD) {
44769 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
44770 bool FoundAndLSB = false;
44771 SDValue Carry = EFLAGS.getOperand(0);
44772 while (Carry.getOpcode() == ISD::TRUNCATE ||
44773 Carry.getOpcode() == ISD::ZERO_EXTEND ||
44774 (Carry.getOpcode() == ISD::AND &&
44775 isOneConstant(Carry.getOperand(1)))) {
44776 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
44777 Carry = Carry.getOperand(0);
44778 }
44779 if (Carry.getOpcode() == X86ISD::SETCC ||
44780 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
44781 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
44782 uint64_t CarryCC = Carry.getConstantOperandVal(0);
44783 SDValue CarryOp1 = Carry.getOperand(1);
44784 if (CarryCC == X86::COND_B)
44785 return CarryOp1;
44786 if (CarryCC == X86::COND_A) {
44787 // Try to convert COND_A into COND_B in an attempt to facilitate
44788 // materializing "setb reg".
44789 //
44790 // Do not flip "e > c", where "c" is a constant, because Cmp
44791 // instruction cannot take an immediate as its first operand.
44792 //
44793 if (CarryOp1.getOpcode() == X86ISD::SUB &&
44794 CarryOp1.getNode()->hasOneUse() &&
44795 CarryOp1.getValueType().isInteger() &&
44796 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
44797 SDValue SubCommute =
44798 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
44799 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
44800 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
44801 }
44802 }
44803 // If this is a check of the z flag of an add with 1, switch to the
44804 // C flag.
44805 if (CarryCC == X86::COND_E &&
44806 CarryOp1.getOpcode() == X86ISD::ADD &&
44807 isOneConstant(CarryOp1.getOperand(1)))
44808 return CarryOp1;
44809 } else if (FoundAndLSB) {
44810 SDLoc DL(Carry);
44811 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
44812 if (Carry.getOpcode() == ISD::SRL) {
44813 BitNo = Carry.getOperand(1);
44814 Carry = Carry.getOperand(0);
44815 }
44816 return getBT(Carry, BitNo, DL, DAG);
44817 }
44818 }
44819 }
44820
44821 return SDValue();
44822}
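A small sketch, under the assumption that carry legalization produces the (add setcc, -1) shape described above, of a source pattern whose carry bit can be routed straight from EFLAGS to its consumer:

// Illustrative only: the carry-out of the low addition is computed with an
// unsigned compare (a SETCC of COND_B) and then added into the high word;
// that is the kind of carry the combine above can feed to an ADC-style
// consumer instead of re-materializing it as a 0/1 value.
unsigned long long addCarryInto(unsigned long long ALo, unsigned long long BLo,
                                unsigned long long &Hi) {
  unsigned long long Lo = ALo + BLo;
  bool Carry = Lo < ALo; // carry-out of the low half
  Hi += Carry;           // carry consumed by the high half
  return Lo;
}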
44823
44824 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
44825/// to avoid the inversion.
44826static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
44827 SelectionDAG &DAG,
44828 const X86Subtarget &Subtarget) {
44829 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
44830 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
44831 EFLAGS.getOpcode() != X86ISD::TESTP)
44832 return SDValue();
44833
44834 // PTEST/TESTP sets EFLAGS as:
44835 // TESTZ: ZF = (Op0 & Op1) == 0
44836 // TESTC: CF = (~Op0 & Op1) == 0
44837 // TESTNZC: ZF == 0 && CF == 0
44838 EVT VT = EFLAGS.getValueType();
44839 SDValue Op0 = EFLAGS.getOperand(0);
44840 SDValue Op1 = EFLAGS.getOperand(1);
44841 EVT OpVT = Op0.getValueType();
44842
44843 // TEST*(~X,Y) == TEST*(X,Y)
44844 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
44845 X86::CondCode InvCC;
44846 switch (CC) {
44847 case X86::COND_B:
44848 // testc -> testz.
44849 InvCC = X86::COND_E;
44850 break;
44851 case X86::COND_AE:
44852 // !testc -> !testz.
44853 InvCC = X86::COND_NE;
44854 break;
44855 case X86::COND_E:
44856 // testz -> testc.
44857 InvCC = X86::COND_B;
44858 break;
44859 case X86::COND_NE:
44860 // !testz -> !testc.
44861 InvCC = X86::COND_AE;
44862 break;
44863 case X86::COND_A:
44864 case X86::COND_BE:
44865 // testnzc -> testnzc (no change).
44866 InvCC = CC;
44867 break;
44868 default:
44869 InvCC = X86::COND_INVALID;
44870 break;
44871 }
44872
44873 if (InvCC != X86::COND_INVALID) {
44874 CC = InvCC;
44875 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44876 DAG.getBitcast(OpVT, NotOp0), Op1);
44877 }
44878 }
44879
44880 if (CC == X86::COND_E || CC == X86::COND_NE) {
44881 // TESTZ(X,~Y) == TESTC(Y,X)
44882 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
44883 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44884 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44885 DAG.getBitcast(OpVT, NotOp1), Op0);
44886 }
44887
44888 if (Op0 == Op1) {
44889 SDValue BC = peekThroughBitcasts(Op0);
44890 EVT BCVT = BC.getValueType();
44891 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
44892 "Unexpected vector type");
44893
44894 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
44895 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
44896 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44897 DAG.getBitcast(OpVT, BC.getOperand(0)),
44898 DAG.getBitcast(OpVT, BC.getOperand(1)));
44899 }
44900
44901 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
44902 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
44903 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44904 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44905 DAG.getBitcast(OpVT, BC.getOperand(0)),
44906 DAG.getBitcast(OpVT, BC.getOperand(1)));
44907 }
44908
44909 // If every element is an all-sign value, see if we can use MOVMSK to
44910 // more efficiently extract the sign bits and compare that.
44911 // TODO: Handle TESTC with comparison inversion.
44912 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
44913 // MOVMSK combines to make sure it's never worse than PTEST?
44914 unsigned EltBits = BCVT.getScalarSizeInBits();
44915 if (DAG.ComputeNumSignBits(BC) == EltBits) {
44916 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
44917 APInt SignMask = APInt::getSignMask(EltBits);
44918 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44919 if (SDValue Res =
44920 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
44921 // For vXi16 cases we need to use pmovmskb and extract every other
44922 // sign bit.
44923 SDLoc DL(EFLAGS);
44924 if (EltBits == 16) {
44925 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
44926 Res = DAG.getBitcast(MovmskVT, Res);
44927 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44928 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
44929 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
44930 } else {
44931 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44932 }
44933 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
44934 DAG.getConstant(0, DL, MVT::i32));
44935 }
44936 }
44937 }
44938
44939 // TESTZ(-1,X) == TESTZ(X,X)
44940 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
44941 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
44942
44943 // TESTZ(X,-1) == TESTZ(X,X)
44944 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
44945 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
44946
44947 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
44948 // TODO: Add COND_NE handling?
44949 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
44950 SDValue Src0 = peekThroughBitcasts(Op0);
44951 SDValue Src1 = peekThroughBitcasts(Op1);
44952 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
44953 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
44954 peekThroughBitcasts(Src0.getOperand(1)), true);
44955 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
44956 peekThroughBitcasts(Src1.getOperand(1)), true);
44957 if (Src0 && Src1)
44958 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44959 DAG.getBitcast(MVT::v4i64, Src0),
44960 DAG.getBitcast(MVT::v4i64, Src1));
44961 }
44962 }
44963 }
44964
44965 return SDValue();
44966}
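The TESTZ/TESTC identity exploited above (testz(~X, Y) == testc(X, Y)) can be seen at the intrinsics level; the following SSE4.1 sketch is an assumed way such a pattern might be written, not code from this file:

#include <immintrin.h>

// Illustrative only, requires SSE4.1. (~X & Y) == 0 holds exactly when every
// bit set in Y is also set in X; spelling it with an explicit NOT plus testz
// gives the combine above the chance to drop the NOT and flip to testc.
bool isSubsetOf(__m128i Y, __m128i X) {
  __m128i NotX = _mm_xor_si128(X, _mm_set1_epi32(-1));
  return _mm_testz_si128(NotX, Y) != 0;
}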
44967
44968// Attempt to simplify the MOVMSK input based on the comparison type.
44969static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
44970 SelectionDAG &DAG,
44971 const X86Subtarget &Subtarget) {
44972 // Handle eq/ne against zero (any_of).
44973 // Handle eq/ne against -1 (all_of).
44974 if (!(CC == X86::COND_E || CC == X86::COND_NE))
44975 return SDValue();
44976 if (EFLAGS.getValueType() != MVT::i32)
44977 return SDValue();
44978 unsigned CmpOpcode = EFLAGS.getOpcode();
44979 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
44980 return SDValue();
44981 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
44982 if (!CmpConstant)
44983 return SDValue();
44984 const APInt &CmpVal = CmpConstant->getAPIntValue();
44985
44986 SDValue CmpOp = EFLAGS.getOperand(0);
44987 unsigned CmpBits = CmpOp.getValueSizeInBits();
44988 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
44989
44990 // Peek through any truncate.
44991 if (CmpOp.getOpcode() == ISD::TRUNCATE)
44992 CmpOp = CmpOp.getOperand(0);
44993
44994 // Bail if we don't find a MOVMSK.
44995 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
44996 return SDValue();
44997
44998 SDValue Vec = CmpOp.getOperand(0);
44999 MVT VecVT = Vec.getSimpleValueType();
45000 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
45001 "Unexpected MOVMSK operand");
45002 unsigned NumElts = VecVT.getVectorNumElements();
45003 unsigned NumEltBits = VecVT.getScalarSizeInBits();
45004
45005 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
45006 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
45007 NumElts <= CmpBits && CmpVal.isMask(NumElts);
45008 if (!IsAnyOf && !IsAllOf)
45009 return SDValue();
45010
45011 // See if we can peek through to a vector with a wider element type, if the
45012 // signbits extend down to all the sub-elements as well.
45013 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
45014 // potential SimplifyDemandedBits/Elts cases.
45015 // If we looked through a truncate that discarded bits, we can't do this
45016 // transform.
45017 // FIXME: We could do this transform for truncates that discarded bits by
45018 // inserting an AND mask between the new MOVMSK and the CMP.
45019 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
45020 SDValue BC = peekThroughBitcasts(Vec);
45021 MVT BCVT = BC.getSimpleValueType();
45022 unsigned BCNumElts = BCVT.getVectorNumElements();
45023 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
45024 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
45025 BCNumEltBits > NumEltBits &&
45026 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
45027 SDLoc DL(EFLAGS);
45028 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
45029 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45030 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
45031 DAG.getConstant(CmpMask, DL, MVT::i32));
45032 }
45033 }
45034
45035 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
45036 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
45037 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
45038 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
45039 if (VecVT.is256BitVector() && NumElts <= CmpBits) {
45040 SmallVector<SDValue> Ops;
45041 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
45042 Ops.size() == 2) {
45043 SDLoc DL(EFLAGS);
45044 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
45045 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
45046 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
45047 DAG.getBitcast(SubVT, Ops[0]),
45048 DAG.getBitcast(SubVT, Ops[1]));
45049 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
45050 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45051 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
45052 DAG.getConstant(CmpMask, DL, MVT::i32));
45053 }
45054 }
45055
45056 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
45057 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
45058 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
45059 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
45060 if (IsAllOf && Subtarget.hasSSE41()) {
45061 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
45062 SDValue BC = peekThroughBitcasts(Vec);
45063 // Ensure MOVMSK was testing every signbit of BC.
45064 if (BC.getValueType().getVectorNumElements() <= NumElts) {
45065 if (BC.getOpcode() == X86ISD::PCMPEQ) {
45066 SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
45067 BC.getOperand(0), BC.getOperand(1));
45068 V = DAG.getBitcast(TestVT, V);
45069 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45070 }
45071 // Check for 256-bit split vector cases.
45072 if (BC.getOpcode() == ISD::AND &&
45073 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
45074 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
45075 SDValue LHS = BC.getOperand(0);
45076 SDValue RHS = BC.getOperand(1);
45077 LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
45078 LHS.getOperand(0), LHS.getOperand(1));
45079 RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
45080 RHS.getOperand(0), RHS.getOperand(1));
45081 LHS = DAG.getBitcast(TestVT, LHS);
45082 RHS = DAG.getBitcast(TestVT, RHS);
45083 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
45084 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45085 }
45086 }
45087 }
45088
45089 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
45090 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
45091 // sign bits prior to the comparison with zero unless we know that
45092 // the vXi16 splats the sign bit down to the lower i8 half.
45093 // TODO: Handle all_of patterns.
45094 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
45095 SDValue VecOp0 = Vec.getOperand(0);
45096 SDValue VecOp1 = Vec.getOperand(1);
45097 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
45098 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
45099 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
45100 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
45101 SDLoc DL(EFLAGS);
45102 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
45103 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45104 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
45105 if (!SignExt0) {
45106 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
45107 DAG.getConstant(0xAAAA, DL, MVT::i16));
45108 }
45109 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45110 DAG.getConstant(0, DL, MVT::i16));
45111 }
45112 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
45113 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
45114 if (CmpBits >= 16 && Subtarget.hasInt256() &&
45115 (IsAnyOf || (SignExt0 && SignExt1))) {
45116 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
45117 SDLoc DL(EFLAGS);
45118 SDValue Result = peekThroughBitcasts(Src);
45119 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
45120 Result.getValueType().getVectorNumElements() <= NumElts) {
45121 SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
45122 Result.getOperand(0), Result.getOperand(1));
45123 V = DAG.getBitcast(MVT::v4i64, V);
45124 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45125 }
45126 Result = DAG.getBitcast(MVT::v32i8, Result);
45127 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45128 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
45129 if (!SignExt0 || !SignExt1) {
45130 assert(IsAnyOf &&
45131 "Only perform v16i16 signmasks for any_of patterns");
45132 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
45133 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
45134 }
45135 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45136 DAG.getConstant(CmpMask, DL, MVT::i32));
45137 }
45138 }
45139 }
45140
45141 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
45142 SmallVector<int, 32> ShuffleMask;
45143 SmallVector<SDValue, 2> ShuffleInputs;
45144 if (NumElts <= CmpBits &&
45145 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
45146 ShuffleMask, DAG) &&
45147 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
45148 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
45149 unsigned NumShuffleElts = ShuffleMask.size();
45150 APInt DemandedElts = APInt::getZero(NumShuffleElts);
45151 for (int M : ShuffleMask) {
45152 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
45153 DemandedElts.setBit(M);
45154 }
45155 if (DemandedElts.isAllOnes()) {
45156 SDLoc DL(EFLAGS);
45157 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
45158 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45159 Result =
45160 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
45161 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45162 EFLAGS.getOperand(1));
45163 }
45164 }
45165
45166 return SDValue();
45167}
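The any_of / all_of idioms this MOVMSK combine recognizes look roughly like the following SSE2 intrinsics sketch; it is illustrative only and says nothing about what the folded code will be:

#include <emmintrin.h>

// any_of: MOVMSK compared against zero under COND_E/COND_NE.
bool anyByteNegative(__m128i V) {
  return _mm_movemask_epi8(V) != 0;
}

// all_of: MOVMSK of a PCMPEQ compared against the all-lanes mask (0xFFFF for
// 16 x i8), the case that can become a PTEST on SSE4.1 targets.
bool allBytesEqual(__m128i A, __m128i B) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(A, B)) == 0xFFFF;
}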
45168
45169/// Optimize an EFLAGS definition used according to the condition code \p CC
45170/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
45171/// uses of chain values.
45172static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
45173 SelectionDAG &DAG,
45174 const X86Subtarget &Subtarget) {
45175 if (CC == X86::COND_B)
45176 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
45177 return Flags;
45178
45179 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
45180 return R;
45181
45182 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
45183 return R;
45184
45185 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
45186 return R;
45187
45188 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
45189}
45190
45191/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
45192static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
45193 TargetLowering::DAGCombinerInfo &DCI,
45194 const X86Subtarget &Subtarget) {
45195 SDLoc DL(N);
45196
45197 SDValue FalseOp = N->getOperand(0);
45198 SDValue TrueOp = N->getOperand(1);
45199 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
45200 SDValue Cond = N->getOperand(3);
45201
45202 // cmov X, X, ?, ? --> X
45203 if (TrueOp == FalseOp)
45204 return TrueOp;
45205
45206 // Try to simplify the EFLAGS and condition code operands.
45207 // We can't always do this as FCMOV only supports a subset of X86 cond.
45208 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
45209 if (!(FalseOp.getValueType() == MVT::f80 ||
45210 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
45211 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
45212 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
45213 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
45214 Flags};
45215 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45216 }
45217 }
45218
45219 // If this is a select between two integer constants, try to do some
45220 // optimizations. Note that the operands are ordered the opposite of SELECT
45221 // operands.
45222 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
45223 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
45224 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
45225 // larger than FalseC (the false value).
45226 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
45227 CC = X86::GetOppositeBranchCondition(CC);
45228 std::swap(TrueC, FalseC);
45229 std::swap(TrueOp, FalseOp);
45230 }
45231
45232 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
45233 // This is efficient for any integer data type (including i8/i16) and
45234 // shift amount.
45235 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
45236 Cond = getSETCC(CC, Cond, DL, DAG);
45237
45238 // Zero extend the condition if needed.
45239 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
45240
45241 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
45242 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
45243 DAG.getConstant(ShAmt, DL, MVT::i8));
45244 return Cond;
45245 }
45246
45247 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
45248 // for any integer data type, including i8/i16.
45249 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
45250 Cond = getSETCC(CC, Cond, DL, DAG);
45251
45252 // Zero extend the condition if needed.
45253 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
45254 FalseC->getValueType(0), Cond);
45255 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45256 SDValue(FalseC, 0));
45257 return Cond;
45258 }
45259
45260 // Optimize cases that will turn into an LEA instruction. This requires
45261 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
45262 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
45263 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
45264 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
45265 "Implicit constant truncation");
45266
45267 bool isFastMultiplier = false;
45268 if (Diff.ult(10)) {
45269 switch (Diff.getZExtValue()) {
45270 default: break;
45271 case 1: // result = add base, cond
45272 case 2: // result = lea base( , cond*2)
45273 case 3: // result = lea base(cond, cond*2)
45274 case 4: // result = lea base( , cond*4)
45275 case 5: // result = lea base(cond, cond*4)
45276 case 8: // result = lea base( , cond*8)
45277 case 9: // result = lea base(cond, cond*8)
45278 isFastMultiplier = true;
45279 break;
45280 }
45281 }
45282
45283 if (isFastMultiplier) {
45284 Cond = getSETCC(CC, Cond, DL, DAG);
45285 // Zero extend the condition if needed.
45286 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
45287 Cond);
45288 // Scale the condition by the difference.
45289 if (Diff != 1)
45290 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
45291 DAG.getConstant(Diff, DL, Cond.getValueType()));
45292
45293 // Add the base if non-zero.
45294 if (FalseC->getAPIntValue() != 0)
45295 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45296 SDValue(FalseC, 0));
45297 return Cond;
45298 }
45299 }
45300 }
45301 }
45302
45303 // Handle these cases:
45304 // (select (x != c), e, c) -> (select (x != c), e, x),
45305 // (select (x == c), c, e) -> (select (x == c), x, e)
45306 // where c is an integer constant, and the "select" is the combination
45307 // of CMOV and CMP.
45308 //
45309 // The rationale for this change is that the conditional-move from a constant
45310 // needs two instructions, however, conditional-move from a register needs
45311 // only one instruction.
45312 //
45313 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
45314 // some instruction-combining opportunities. This opt needs to be
45315 // postponed as late as possible.
45316 //
45317 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
45318 // the DCI.xxxx conditions are provided to postpone the optimization as
45319 // late as possible.
45320
45321 ConstantSDNode *CmpAgainst = nullptr;
45322 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
45323 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
45324 !isa<ConstantSDNode>(Cond.getOperand(0))) {
45325
45326 if (CC == X86::COND_NE &&
45327 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
45328 CC = X86::GetOppositeBranchCondition(CC);
45329 std::swap(TrueOp, FalseOp);
45330 }
45331
45332 if (CC == X86::COND_E &&
45333 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
45334 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
45335 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
45336 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45337 }
45338 }
45339 }
45340
45341 // Fold and/or of setcc's to double CMOV:
45342 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
45343 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
45344 //
45345 // This combine lets us generate:
45346 // cmovcc1 (jcc1 if we don't have CMOV)
45347 // cmovcc2 (same)
45348 // instead of:
45349 // setcc1
45350 // setcc2
45351 // and/or
45352 // cmovne (jne if we don't have CMOV)
45353 // When we can't use the CMOV instruction, it might increase branch
45354 // mispredicts.
45355 // When we can use CMOV, or when there is no mispredict, this improves
45356 // throughput and reduces register pressure.
45357 //
45358 if (CC == X86::COND_NE) {
45359 SDValue Flags;
45360 X86::CondCode CC0, CC1;
45361 bool isAndSetCC;
45362 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
45363 if (isAndSetCC) {
45364 std::swap(FalseOp, TrueOp);
45365 CC0 = X86::GetOppositeBranchCondition(CC0);
45366 CC1 = X86::GetOppositeBranchCondition(CC1);
45367 }
45368
45369 SDValue LOps[] = {FalseOp, TrueOp,
45370 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
45371 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
45372 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
45373 Flags};
45374 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45375 return CMOV;
45376 }
45377 }
45378
45379 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
45380 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
45381 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
45382 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
45383 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
45384 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
45385 SDValue Add = TrueOp;
45386 SDValue Const = FalseOp;
45387 // Canonicalize the condition code for easier matching and output.
45388 if (CC == X86::COND_E)
45389 std::swap(Add, Const);
45390
45391 // We might have replaced the constant in the cmov with the LHS of the
45392 // compare. If so change it to the RHS of the compare.
45393 if (Const == Cond.getOperand(0))
45394 Const = Cond.getOperand(1);
45395
45396 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
45397 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
45398 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
45399 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
45400 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
45401 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
45402 EVT VT = N->getValueType(0);
45403 // This should constant fold.
45404 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
45405 SDValue CMov =
45406 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
45407 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
45408 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
45409 }
45410 }
45411
45412 return SDValue();
45413}
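As a concrete illustration of the constant-select paths above, a tiny example whose two constants differ by 9, one of the LEA-friendly multipliers; the actual instruction choice is of course up to the backend:

// Illustrative only: TrueC - FalseC == 9, so the cmov can be rewritten as
// zext(setcc(X != 0)) * 9 + 4, which fits a single LEA.
int pickConstant(int X) {
  return (X != 0) ? 13 : 4;
}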
45414
45415/// Different mul shrinking modes.
45416enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
45417
45418static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
45419 EVT VT = N->getOperand(0).getValueType();
45420 if (VT.getScalarSizeInBits() != 32)
45421 return false;
45422
45423 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
45424 unsigned SignBits[2] = {1, 1};
45425 bool IsPositive[2] = {false, false};
45426 for (unsigned i = 0; i < 2; i++) {
45427 SDValue Opd = N->getOperand(i);
45428
45429 SignBits[i] = DAG.ComputeNumSignBits(Opd);
45430 IsPositive[i] = DAG.SignBitIsZero(Opd);
45431 }
45432
45433 bool AllPositive = IsPositive[0] && IsPositive[1];
45434 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
45435 // When ranges are from -128 ~ 127, use MULS8 mode.
45436 if (MinSignBits >= 25)
45437 Mode = ShrinkMode::MULS8;
45438 // When ranges are from 0 ~ 255, use MULU8 mode.
45439 else if (AllPositive && MinSignBits >= 24)
45440 Mode = ShrinkMode::MULU8;
45441 // When ranges are from -32768 ~ 32767, use MULS16 mode.
45442 else if (MinSignBits >= 17)
45443 Mode = ShrinkMode::MULS16;
45444 // When ranges are from 0 ~ 65535, use MULU16 mode.
45445 else if (AllPositive && MinSignBits >= 16)
45446 Mode = ShrinkMode::MULU16;
45447 else
45448 return false;
45449 return true;
45450}
45451
45452/// When the operands of vector mul are extended from smaller size values,
45453 /// like i8 and i16, the type of mul may be shrunk to generate more
45454/// efficient code. Two typical patterns are handled:
45455/// Pattern1:
45456/// %2 = sext/zext <N x i8> %1 to <N x i32>
45457/// %4 = sext/zext <N x i8> %3 to <N x i32>
45458 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
45459/// %5 = mul <N x i32> %2, %4
45460///
45461/// Pattern2:
45462/// %2 = zext/sext <N x i16> %1 to <N x i32>
45463/// %4 = zext/sext <N x i16> %3 to <N x i32>
45464/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
45465/// %5 = mul <N x i32> %2, %4
45466///
45467/// There are four mul shrinking modes:
45468/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
45469 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
45470/// generate pmullw+sext32 for it (MULS8 mode).
45471/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
45472/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
45473/// generate pmullw+zext32 for it (MULU8 mode).
45474/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
45475/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
45476/// generate pmullw+pmulhw for it (MULS16 mode).
45477/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
45478/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
45479/// generate pmullw+pmulhuw for it (MULU16 mode).
45480static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
45481 const X86Subtarget &Subtarget) {
45482 // Check for legality
45483 // pmullw/pmulhw require SSE2.
45484 if (!Subtarget.hasSSE2())
45485 return SDValue();
45486
45487 // Check for profitability
45488 // pmulld is supported since SSE41. It is better to use pmulld
45489 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
45490 // the expansion.
45491 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
45492 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
45493 return SDValue();
45494
45495 ShrinkMode Mode;
45496 if (!canReduceVMulWidth(N, DAG, Mode))
45497 return SDValue();
45498
45499 SDLoc DL(N);
45500 SDValue N0 = N->getOperand(0);
45501 SDValue N1 = N->getOperand(1);
45502 EVT VT = N->getOperand(0).getValueType();
45503 unsigned NumElts = VT.getVectorNumElements();
45504 if ((NumElts % 2) != 0)
45505 return SDValue();
45506
45507 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
45508
45509 // Shrink the operands of mul.
45510 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
45511 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
45512
45513 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
45514 // lower part is needed.
45515 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
45516 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
45517 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
45518 : ISD::SIGN_EXTEND,
45519 DL, VT, MulLo);
45520
45521 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
45522 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
45523 // the higher part is also needed.
45524 SDValue MulHi =
45525 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
45526 ReducedVT, NewN0, NewN1);
45527
45528 // Repack the lower part and higher part result of mul into a wider
45529 // result.
45530 // Generate shuffle functioning as punpcklwd.
45531 SmallVector<int, 16> ShuffleMask(NumElts);
45532 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45533 ShuffleMask[2 * i] = i;
45534 ShuffleMask[2 * i + 1] = i + NumElts;
45535 }
45536 SDValue ResLo =
45537 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45538 ResLo = DAG.getBitcast(ResVT, ResLo);
45539 // Generate shuffle functioning as punpckhwd.
45540 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45541 ShuffleMask[2 * i] = i + NumElts / 2;
45542 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
45543 }
45544 SDValue ResHi =
45545 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45546 ResHi = DAG.getBitcast(ResVT, ResHi);
45547 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
45548}
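A plain C++ sketch of Pattern2 from the function comment above (operands sign-extended from i16, MULS16 mode); whether the vectorizer and this combine actually produce pmullw+pmulhw depends on the subtarget and is not guaranteed:

// Illustrative only: each 32-bit product is formed from operands whose scalar
// value ranges fit in i16, so the multiply can be shrunk and the low/high
// halves recombined with the unpack-style shuffles built above.
void mulFromShorts(const short *A, const short *B, int *Out, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = (int)A[i] * (int)B[i];
}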
45549
45550static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
45551 EVT VT, const SDLoc &DL) {
45552
45553 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
45554 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45555 DAG.getConstant(Mult, DL, VT));
45556 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
45557 DAG.getConstant(Shift, DL, MVT::i8));
45558 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45559 N->getOperand(0));
45560 return Result;
45561 };
45562
45563 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
45564 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45565 DAG.getConstant(Mul1, DL, VT));
45566 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
45567 DAG.getConstant(Mul2, DL, VT));
45568 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45569 N->getOperand(0));
45570 return Result;
45571 };
45572
45573 switch (MulAmt) {
45574 default:
45575 break;
45576 case 11:
45577 // mul x, 11 => add ((shl (mul x, 5), 1), x)
45578 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
45579 case 21:
45580 // mul x, 21 => add ((shl (mul x, 5), 2), x)
45581 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
45582 case 41:
45583 // mul x, 41 => add ((shl (mul x, 5), 3), x)
45584 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
45585 case 22:
45586 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
45587 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45588 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
45589 case 19:
45590 // mul x, 19 => add ((shl (mul x, 9), 1), x)
45591 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
45592 case 37:
45593 // mul x, 37 => add ((shl (mul x, 9), 2), x)
45594 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
45595 case 73:
45596 // mul x, 73 => add ((shl (mul x, 9), 3), x)
45597 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
45598 case 13:
45599 // mul x, 13 => add ((shl (mul x, 3), 2), x)
45600 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
45601 case 23:
45602 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
45603 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
45604 case 26:
45605 // mul x, 26 => add ((mul (mul x, 5), 5), x)
45606 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
45607 case 28:
45608 // mul x, 28 => add ((mul (mul x, 9), 3), x)
45609 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
45610 case 29:
45611 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
45612 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45613 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
45614 }
45615
45616 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
45617 // followed by a single LEA.
45618 // First check if this is a sum of two powers of 2 because that's easy. Then
45619 // count the trailing zeros to find the smaller power of 2.
45620 // TODO: We can do this even without LEA at a cost of two shifts and an add.
45621 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
45622 unsigned ScaleShift = countTrailingZeros(MulAmt);
45623 if (ScaleShift >= 1 && ScaleShift < 4) {
45624 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
45625 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45626 DAG.getConstant(ShiftAmt, DL, MVT::i8));
45627 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45628 DAG.getConstant(ScaleShift, DL, MVT::i8));
45629 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
45630 }
45631 }
45632
45633 return SDValue();
45634}
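The decompositions in the switch above are plain arithmetic identities; a few of them spot-checked at compile time with an arbitrary sample value (a sketch, not part of this file):

// mul x, 11 => add ((shl (mul x, 5), 1), x), and friends, evaluated at x = 7.
static_assert(11u * 7u == ((7u * 5u) << 1) + 7u, "mul x, 11");
static_assert(21u * 7u == ((7u * 5u) << 2) + 7u, "mul x, 21");
static_assert(23u * 7u == ((7u * 3u) << 3) - 7u, "mul x, 23");
static_assert(26u * 7u == ((7u * 5u) * 5u) + 7u, "mul x, 26");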
45635
45636 // If the upper 17 bits of one operand are zero and the other operand's
45637 // upper bits are all zero/sign bits, then we can use PMADDWD, which is always
45638 // at least as quick as PMULLD, except on KNL.
45639static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
45640 const X86Subtarget &Subtarget) {
45641 if (!Subtarget.hasSSE2())
45642 return SDValue();
45643
45644 if (Subtarget.isPMADDWDSlow())
45645 return SDValue();
45646
45647 EVT VT = N->getValueType(0);
45648
45649 // Only support vXi32 vectors.
45650 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
45651 return SDValue();
45652
45653 // Make sure the type is legal or can split/widen to a legal type.
45654 // With AVX512 but without BWI, we would need to split v32i16.
45655 unsigned NumElts = VT.getVectorNumElements();
45656 if (NumElts == 1 || !isPowerOf2_32(NumElts))
45657 return SDValue();
45658
45659 EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);
45660
45661 // With AVX512 but without BWI, we would need to split v32i16.
45662 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
45663 return SDValue();
45664
45665 SDValue N0 = N->getOperand(0);
45666 SDValue N1 = N->getOperand(1);
45667
45668 // If we are zero/sign extending two steps without SSE4.1, it's better to
45669 // reduce the vmul width instead.
45670 if (!Subtarget.hasSSE41() &&
45671 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
45672 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45673 (N1.getOpcode() == ISD::ZERO_EXTEND &&
45674 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
45675 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
45676 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45677 (N1.getOpcode() == ISD::SIGN_EXTEND &&
45678 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
45679 return SDValue();
45680
45681 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
45682 // the vmul width instead.
45683 if (!Subtarget.hasSSE41() &&
45684 (N0.getOpcode() == ISD::SIGN_EXTEND &&
45685 N0.getOperand(0).getValueSizeInBits() > 128) &&
45686 (N1.getOpcode() == ISD::SIGN_EXTEND &&
45687 N1.getOperand(0).getValueSizeInBits() > 128))
45688 return SDValue();
45689
45690 // Sign bits must extend down to the lowest i16.
45691 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
45692 DAG.ComputeMaxSignificantBits(N0) > 16)
45693 return SDValue();
45694
45695 // At least one of the elements must be zero in the upper 17 bits, or can be
45696 // safely made zero without altering the final result.
45697 auto GetZeroableOp = [&](SDValue Op) {
45698 APInt Mask17 = APInt::getHighBitsSet(32, 17);
45699 if (DAG.MaskedValueIsZero(Op, Mask17))
45700 return Op;
45701 // Mask off upper 16-bits of sign-extended constants.
45702 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
45703 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
45704 DAG.getConstant(0xFFFF, SDLoc(N), VT));
45705 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
45706 SDValue Src = Op.getOperand(0);
45707 // Convert sext(vXi16) to zext(vXi16).
45708 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
45709 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45710 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
45711 // which will expand the extension.
45712 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
45713 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
45714 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
45715 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45716 }
45717 }
45718 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
45719 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
45720 N->isOnlyUserOf(Op.getNode())) {
45721 SDValue Src = Op.getOperand(0);
45722 if (Src.getScalarValueSizeInBits() == 16)
45723 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
45724 }
45725 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
45726 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
45727 N->isOnlyUserOf(Op.getNode())) {
45728 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
45729 Op.getOperand(1));
45730 }
45731 return SDValue();
45732 };
45733 SDValue ZeroN0 = GetZeroableOp(N0);
45734 SDValue ZeroN1 = GetZeroableOp(N1);
45735 if (!ZeroN0 && !ZeroN1)
45736 return SDValue();
45737 N0 = ZeroN0 ? ZeroN0 : N0;
45738 N1 = ZeroN1 ? ZeroN1 : N1;
45739
45740 // Use SplitOpsAndApply to handle AVX splitting.
45741 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45742 ArrayRef<SDValue> Ops) {
45743 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45744 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
45745 };
45746 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
45747 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
45748 PMADDWDBuilder);
45749}
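A hedged sketch of operands that satisfy the two conditions checked above: one operand has its upper 17 bits known zero (zero-extended from u8) and both have their significant bits confined to the low i16. Whether PMADDWD is actually selected depends on the subtarget; illustrative only:

void mulMixedWidths(const unsigned char *A, const short *B, int *Out, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = (int)A[i] * (int)B[i]; // zext(u8) * sext(i16) -> i32
}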
45750
45751static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
45752 const X86Subtarget &Subtarget) {
45753 if (!Subtarget.hasSSE2())
45754 return SDValue();
45755
45756 EVT VT = N->getValueType(0);
45757
45758 // Only support vXi64 vectors.
45759 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
45760 VT.getVectorNumElements() < 2 ||
45761 !isPowerOf2_32(VT.getVectorNumElements()))
45762 return SDValue();
45763
45764 SDValue N0 = N->getOperand(0);
45765 SDValue N1 = N->getOperand(1);
45766
45767 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
45768 // 32 bits. We can lower with this if the sign bits stretch that far.
45769 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
45770 DAG.ComputeNumSignBits(N1) > 32) {
45771 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45772 ArrayRef<SDValue> Ops) {
45773 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
45774 };
45775 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45776 PMULDQBuilder, /*CheckBWI*/false);
45777 }
45778
45779 // If the upper bits are zero we can use a single pmuludq.
45780 APInt Mask = APInt::getHighBitsSet(64, 32);
45781 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
45782 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45783 ArrayRef<SDValue> Ops) {
45784 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
45785 };
45786 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45787 PMULUDQBuilder, /*CheckBWI*/false);
45788 }
45789
45790 return SDValue();
45791}
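A sketch of the single-pmuludq case above: a 64-bit vector multiply whose operands have their upper 32 bits known zero. Illustrative only; the final lowering is not guaranteed:

// Both operands are zero-extended from u32, so MaskedValueIsZero succeeds on
// the high 32 bits and one widening unsigned multiply per element suffices.
void mulWide(const unsigned *A, const unsigned *B, unsigned long long *Out,
             int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = (unsigned long long)A[i] * B[i];
}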
45792
45793static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
45794 TargetLowering::DAGCombinerInfo &DCI,
45795 const X86Subtarget &Subtarget) {
45796 EVT VT = N->getValueType(0);
45797
45798 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
45799 return V;
45800
45801 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
45802 return V;
45803
45804 if (DCI.isBeforeLegalize() && VT.isVector())
45805 return reduceVMULWidth(N, DAG, Subtarget);
45806
45807 // Optimize a single multiply with constant into two operations in order to
45808 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
45809 if (!MulConstantOptimization)
45810 return SDValue();
45811
45812 // An imul is usually smaller than the alternative sequence.
45813 if (DAG.getMachineFunction().getFunction().hasMinSize())
45814 return SDValue();
45815
45816 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
45817 return SDValue();
45818
45819 if (VT != MVT::i64 && VT != MVT::i32)
45820 return SDValue();
45821
45822 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
45823 if (!C)
45824 return SDValue();
45825 if (isPowerOf2_64(C->getZExtValue()))
45826 return SDValue();
45827
45828 int64_t SignMulAmt = C->getSExtValue();
45829 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
45830 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
45831
45832 SDLoc DL(N);
45833 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
45834 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45835 DAG.getConstant(AbsMulAmt, DL, VT));
45836 if (SignMulAmt < 0)
45837 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45838 NewMul);
45839
45840 return NewMul;
45841 }
45842
45843 uint64_t MulAmt1 = 0;
45844 uint64_t MulAmt2 = 0;
45845 if ((AbsMulAmt % 9) == 0) {
45846 MulAmt1 = 9;
45847 MulAmt2 = AbsMulAmt / 9;
45848 } else if ((AbsMulAmt % 5) == 0) {
45849 MulAmt1 = 5;
45850 MulAmt2 = AbsMulAmt / 5;
45851 } else if ((AbsMulAmt % 3) == 0) {
45852 MulAmt1 = 3;
45853 MulAmt2 = AbsMulAmt / 3;
45854 }
45855
45856 SDValue NewMul;
45857 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
45858 if (MulAmt2 &&
45859 (isPowerOf2_64(MulAmt2) ||
45860 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
45861
45862 if (isPowerOf2_64(MulAmt2) &&
45863 !(SignMulAmt >= 0 && N->hasOneUse() &&
45864 N->use_begin()->getOpcode() == ISD::ADD))
45865 // If the second multiplier is pow2, issue it first. We want the multiply by
45866 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
45867 // is an add. Only do this for positive multiply amounts since the
45868 // negate would prevent it from being used as an address mode anyway.
45869 std::swap(MulAmt1, MulAmt2);
45870
45871 if (isPowerOf2_64(MulAmt1))
45872 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45873 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
45874 else
45875 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45876 DAG.getConstant(MulAmt1, DL, VT));
45877
45878 if (isPowerOf2_64(MulAmt2))
45879 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
45880 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
45881 else
45882 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
45883 DAG.getConstant(MulAmt2, DL, VT));
45884
45885 // Negate the result.
45886 if (SignMulAmt < 0)
45887 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45888 NewMul);
45889 } else if (!Subtarget.slowLEA())
45890 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
45891
45892 if (!NewMul) {
45893 assert(C->getZExtValue() != 0 &&
45894 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
45895 "Both cases that could cause potential overflows should have "
45896 "already been handled.");
45897 if (isPowerOf2_64(AbsMulAmt - 1)) {
45898 // (mul x, 2^N + 1) => (add (shl x, N), x)
45899 NewMul = DAG.getNode(
45900 ISD::ADD, DL, VT, N->getOperand(0),
45901 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45902 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
45903 MVT::i8)));
45904 // To negate, subtract the number from zero
45905 if (SignMulAmt < 0)
45906 NewMul = DAG.getNode(ISD::SUB, DL, VT,
45907 DAG.getConstant(0, DL, VT), NewMul);
45908 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
45909 // (mul x, 2^N - 1) => (sub (shl x, N), x)
45910 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45911 DAG.getConstant(Log2_64(AbsMulAmt + 1),
45912 DL, MVT::i8));
45913 // To negate, reverse the operands of the subtract.
45914 if (SignMulAmt < 0)
45915 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
45916 else
45917 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45918 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
45919 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
45920 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45921 DAG.getConstant(Log2_64(AbsMulAmt - 2),
45922 DL, MVT::i8));
45923 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45924 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45925 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
45926 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
45927 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45928 DAG.getConstant(Log2_64(AbsMulAmt + 2),
45929 DL, MVT::i8));
45930 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45931 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45932 }
45933 }
45934
45935 return NewMul;
45936}
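
[Editor's note] As a reader's aid, the scalar identities that the constant-multiply combine above relies on can be sanity-checked with a small standalone C++ program. This is only an illustrative sketch: the file name and values are made up, and nothing below is LLVM code.

    // mul_decompose_demo.cpp -- standalone illustration, not part of the listing.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t x = 12345;
      // 3/5/9 multiplies map to a single LEA-style step: x*9 == (x<<3)+x.
      assert(x * 9 == (x << 3) + x);
      // Composite amounts split into two cheap factors, e.g. 45 = 9 * 5.
      assert(x * 45 == (x * 9) * 5);
      // Power-of-2 +/- 1 amounts become shift+add / shift+sub.
      assert(x * 17 == (x << 4) + x);   // 2^4 + 1
      assert(x * 31 == (x << 5) - x);   // 2^5 - 1
      // Negative amounts reuse the positive pattern and negate via (0 - y).
      int64_t s = 777;
      assert(s * -9 == 0 - (s * 9));
      return 0;
    }
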
45937
45938// Try to form a MULHU or MULHS node by looking for
45939// (srl (mul ext, ext), 16)
45940// TODO: This is X86 specific because we want to be able to handle wide types
45941// before type legalization. But we can only do it if the vector will be
45942// legalized via widening/splitting. Type legalization can't handle promotion
45943// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
45944// combiner.
45945static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
45946 const X86Subtarget &Subtarget) {
45947 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
45948 "SRL or SRA node is required here!");
45949 SDLoc DL(N);
45950
45951 if (!Subtarget.hasSSE2())
45952 return SDValue();
45953
45954 // The operation feeding into the shift must be a multiply.
45955 SDValue ShiftOperand = N->getOperand(0);
45956 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
45957 return SDValue();
45958
45959 // Input type should be at least vXi32.
45960 EVT VT = N->getValueType(0);
45961 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
45962 return SDValue();
45963
45964 // Need a shift by 16.
45965 APInt ShiftAmt;
45966 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
45967 ShiftAmt != 16)
45968 return SDValue();
45969
45970 SDValue LHS = ShiftOperand.getOperand(0);
45971 SDValue RHS = ShiftOperand.getOperand(1);
45972
45973 unsigned ExtOpc = LHS.getOpcode();
45974 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
45975 RHS.getOpcode() != ExtOpc)
45976 return SDValue();
45977
45978 // Peek through the extends.
45979 LHS = LHS.getOperand(0);
45980 RHS = RHS.getOperand(0);
45981
45982 // Ensure the input types match.
45983 EVT MulVT = LHS.getValueType();
45984 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
45985 return SDValue();
45986
45987 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
45988 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
45989
45990 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
45991 return DAG.getNode(ExtOpc, DL, VT, Mulh);
45992}
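
[Editor's note] For reference, the per-element identity that combineShiftToPMULH recognizes -- shifting the widened product right by 16 yields the high half of a 16x16 multiply, which fits back into the narrow element type -- can be checked on scalars. This is a minimal sketch with made-up values, not LLVM code; signed right shift is assumed to be arithmetic, as it is for the x86 compilers this file targets.

    // pmulh_demo.cpp -- scalar check of the widen-multiply-shift pattern.
    #include <cassert>
    #include <cstdint>

    int main() {
      // Unsigned: (srl (mul (zext a), (zext b)), 16) is the high half of the
      // 16x16->32 product, i.e. what a per-element MULHU produces. It fits in
      // 16 bits, so MULHU on the narrow type plus a zero-extend reproduces it.
      uint16_t a = 0xBEEF, b = 0x1234;
      uint32_t wide = uint32_t(a) * uint32_t(b);
      assert((wide >> 16) == uint32_t(uint16_t(wide >> 16)));

      // Signed: sign-extend, multiply, arithmetic shift by 16 gives the MULHS
      // result, which also fits back into 16 bits.
      int16_t sa = -1234, sb = 4321;
      int32_t swide = int32_t(sa) * int32_t(sb);
      assert((swide >> 16) == int32_t(int16_t(swide >> 16)));
      return 0;
    }
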
45993
45994static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
45995 SDValue N0 = N->getOperand(0);
45996 SDValue N1 = N->getOperand(1);
45997 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
45998 EVT VT = N0.getValueType();
45999
46000 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
46001 // since the result of setcc_c is all zero's or all ones.
46002 if (VT.isInteger() && !VT.isVector() &&
46003 N1C && N0.getOpcode() == ISD::AND &&
46004 N0.getOperand(1).getOpcode() == ISD::Constant) {
46005 SDValue N00 = N0.getOperand(0);
46006 APInt Mask = N0.getConstantOperandAPInt(1);
46007 Mask <<= N1C->getAPIntValue();
46008 bool MaskOK = false;
46009 // We can handle cases concerning bit-widening nodes containing setcc_c if
46010 // we carefully interrogate the mask to make sure we are semantics
46011 // preserving.
46012 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
46013 // of the underlying setcc_c operation if the setcc_c was zero extended.
46014 // Consider the following example:
46015 // zext(setcc_c) -> i32 0x0000FFFF
46016 // c1 -> i32 0x0000FFFF
46017 // c2 -> i32 0x00000001
46018 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
46019 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
46020 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
46021 MaskOK = true;
46022 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
46023 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46024 MaskOK = true;
46025 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
46026 N00.getOpcode() == ISD::ANY_EXTEND) &&
46027 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46028 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
46029 }
46030 if (MaskOK && Mask != 0) {
46031 SDLoc DL(N);
46032 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
46033 }
46034 }
46035
46036 // Hardware support for vector shifts is sparse, which makes us scalarize the
46037 // vector operations in many cases. Also, on sandybridge ADD is faster than
46038 // shl.
46039 // (shl V, 1) -> add V,V
46040 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
46041 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
46042 assert(N0.getValueType().isVector() && "Invalid vector shift type");
46043 // We shift all of the values by one. In many cases we do not have
46044 // hardware support for this operation. This is better expressed as an ADD
46045 // of two values.
46046 if (N1SplatC->isOne())
46047 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
46048 }
46049
46050 return SDValue();
46051}
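
[Editor's note] The mask caveat described in the comment inside combineShiftLeft, and the (shl V, 1) -> add V,V fold, can be reproduced with ordinary scalar code. Illustrative sketch only; the values are chosen to match the comment's example and the file name is invented.

    // shl_mask_demo.cpp -- reproduces the comment's caveat and the shl-by-1 fold.
    #include <cassert>
    #include <cstdint>

    int main() {
      // Caveat: with a zero-extended all-ones i16 value, folding the shift into
      // the mask changes the result.
      uint32_t setcc_c = 0x0000FFFF;   // zext of an all-ones i16
      uint32_t c1 = 0x0000FFFF, c2 = 1;
      assert(((setcc_c & c1) << c2) == 0x0001FFFEu);  // shift applied after AND
      assert((setcc_c & (c1 << c2)) == 0x0000FFFEu);  // shift folded into mask
      // The two differ, which is why the transform checks the mask width (MaskOK).

      // (shl V, 1) -> add V,V: the values are equal, and ADD is often cheaper.
      uint32_t v = 0x1234;
      assert((v << 1) == v + v);
      return 0;
    }
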
46052
46053static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
46054 const X86Subtarget &Subtarget) {
46055 SDValue N0 = N->getOperand(0);
46056 SDValue N1 = N->getOperand(1);
46057 EVT VT = N0.getValueType();
46058 unsigned Size = VT.getSizeInBits();
46059
46060 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46061 return V;
46062
46063 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
46064 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
46065 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
46066 // depending on sign of (SarConst - [56,48,32,24,16])
46067
46068 // sexts in X86 are MOVs. The MOVs have the same code size
46069 // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
46070 // However, the MOVs have 2 advantages over a SHIFT:
46071 // 1. MOVs can write to a register that differs from the source
46072 // 2. MOVs accept memory operands
46073
46074 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
46075 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
46076 N0.getOperand(1).getOpcode() != ISD::Constant)
46077 return SDValue();
46078
46079 SDValue N00 = N0.getOperand(0);
46080 SDValue N01 = N0.getOperand(1);
46081 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
46082 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
46083 EVT CVT = N1.getValueType();
46084
46085 if (SarConst.isNegative())
46086 return SDValue();
46087
46088 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
46089 unsigned ShiftSize = SVT.getSizeInBits();
46090 // Skip types without a corresponding sext/zext and ShlConst values
46091 // that are not one of [56,48,32,24,16].
46092 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
46093 continue;
46094 SDLoc DL(N);
46095 SDValue NN =
46096 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
46097 SarConst = SarConst - (Size - ShiftSize);
46098 if (SarConst == 0)
46099 return NN;
46100 if (SarConst.isNegative())
46101 return DAG.getNode(ISD::SHL, DL, VT, NN,
46102 DAG.getConstant(-SarConst, DL, CVT));
46103 return DAG.getNode(ISD::SRA, DL, VT, NN,
46104 DAG.getConstant(SarConst, DL, CVT));
46105 }
46106 return SDValue();
46107}
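
[Editor's note] The sign-extension rewrite above can also be checked on scalars. This is a minimal sketch, not LLVM code; it assumes two's-complement wrap-around on the narrowing casts and an arithmetic right shift for signed values, both of which hold for the x86 compilers this file targets.

    // sra_of_shl_demo.cpp -- the ashr(shl(x, K), SarConst) rewrite on scalars.
    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t x = 0x123456F5;
      // Size = 32, ShiftSize = 8, so K = 24: shl followed by sar by 24 is a sign
      // extension of the low 8 bits (the SIGN_EXTEND_INREG node built above).
      // The unsigned cast avoids signed-overflow UB on the left shift.
      int32_t shl = int32_t(uint32_t(x) << 24);
      assert((shl >> 24) == int32_t(int8_t(x)));
      // If SarConst exceeds K, the leftover amount remains as an SRA:
      // (x << 24) >> 26 == sext_i8(x) >> 2.
      assert((shl >> 26) == (int32_t(int8_t(x)) >> 2));
      return 0;
    }
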
46108
46109static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
46110 TargetLowering::DAGCombinerInfo &DCI,
46111 const X86Subtarget &Subtarget) {
46112 SDValue N0 = N->getOperand(0);
46113 SDValue N1 = N->getOperand(1);
46114 EVT VT = N0.getValueType();
46115
46116 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46117 return V;
46118
46119 // Only do this on the last DAG combine as it can interfere with other
46120 // combines.
46121 if (!DCI.isAfterLegalizeDAG())
46122 return SDValue();
46123
46124 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
46125 // TODO: This is a generic DAG combine that became an x86-only combine to
46126 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
46127 // and-not ('andn').
46128 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
46129 return SDValue();
46130
46131 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
46132 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
46133 if (!ShiftC || !AndC)
46134 return SDValue();
46135
46136 // If we can shrink the constant mask below 8-bits or 32-bits, then this
46137 // transform should reduce code size. It may also enable secondary transforms
46138 // from improved known-bits analysis or instruction selection.
46139 APInt MaskVal = AndC->getAPIntValue();
46140
46141 // If this can be matched by a zero extend, don't optimize.
46142 if (MaskVal.isMask()) {
46143 unsigned TO = MaskVal.countTrailingOnes();
46144 if (TO >= 8 && isPowerOf2_32(TO))
46145 return SDValue();
46146 }
46147
46148 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
46149 unsigned OldMaskSize = MaskVal.getMinSignedBits();
46150 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
46151 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
46152 (OldMaskSize > 32 && NewMaskSize <= 32)) {
46153 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
46154 SDLoc DL(N);
46155 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
46156 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
46157 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
46158 }
46159 return SDValue();
46160}
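
[Editor's note] The reordering that combineShiftRightLogical performs is a plain bitwise identity. Below is a short standalone check with an example mask where the shrink to 8 bits applies; illustrative only, with made-up values.

    // srl_of_and_demo.cpp -- reordering srl(and(X, C1), C2) on scalars.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xDEADBEEF;
      uint32_t C1 = 0x0000FF00, C2 = 8;
      // srl (and X, C1), C2  ==  and (srl X, C2), (C1 >> C2)
      assert(((x & C1) >> C2) == ((x >> C2) & (C1 >> C2)));
      // The original mask needs a wide immediate, but C1 >> C2 == 0xFF fits in
      // 8 bits, which is the code-size improvement the combine looks for.
      assert((C1 >> C2) == 0xFFu);
      return 0;
    }
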
46161
46162static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
46163 const X86Subtarget &Subtarget) {
46164 unsigned Opcode = N->getOpcode();
46165 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
46166
46167 SDLoc DL(N);
46168 EVT VT = N->getValueType(0);
46169 SDValue N0 = N->getOperand(0);
46170 SDValue N1 = N->getOperand(1);
46171 EVT SrcVT = N0.getValueType();
46172
46173 SDValue BC0 =
46174 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
46175 SDValue BC1 =
46176 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
46177
46178 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
46179 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
46180 // truncation trees that help us avoid lane crossing shuffles.
46181 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
46182 // TODO: We don't handle vXf64 shuffles yet.
46183 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46184 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
46185 SmallVector<SDValue> ShuffleOps;
46186 SmallVector<int> ShuffleMask, ScaledMask;
46187 SDValue Vec = peekThroughBitcasts(BCSrc);
46188 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
46189 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
46190 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
46191 // shuffle to a v4X64 width - we can probably relax this in the future.
46192 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
46193 ShuffleOps[0].getValueType().is256BitVector() &&
46194 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
46195 SDValue Lo, Hi;
46196 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46197 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
46198 Lo = DAG.getBitcast(SrcVT, Lo);
46199 Hi = DAG.getBitcast(SrcVT, Hi);
46200 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
46201 Res = DAG.getBitcast(ShufVT, Res);
46202 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
46203 return DAG.getBitcast(VT, Res);
46204 }
46205 }
46206 }
46207 }
46208
46209 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
46210 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46211 // If either/both ops are a shuffle that can scale to v2x64,
46212 // then see if we can perform this as a v4x32 post shuffle.
46213 SmallVector<SDValue> Ops0, Ops1;
46214 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
46215 bool IsShuf0 =
46216 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46217 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46218 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46219 bool IsShuf1 =
46220 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46221 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
46222 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46223 if (IsShuf0 || IsShuf1) {
46224 if (!IsShuf0) {
46225 Ops0.assign({BC0});
46226 ScaledMask0.assign({0, 1});
46227 }
46228 if (!IsShuf1) {
46229 Ops1.assign({BC1});
46230 ScaledMask1.assign({0, 1});
46231 }
46232
46233 SDValue LHS, RHS;
46234 int PostShuffle[4] = {-1, -1, -1, -1};
46235 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
46236 if (M < 0)
46237 return true;
46238 Idx = M % 2;
46239 SDValue Src = Ops[M / 2];
46240 if (!LHS || LHS == Src) {
46241 LHS = Src;
46242 return true;
46243 }
46244 if (!RHS || RHS == Src) {
46245 Idx += 2;
46246 RHS = Src;
46247 return true;
46248 }
46249 return false;
46250 };
46251 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
46252 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
46253 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
46254 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
46255 LHS = DAG.getBitcast(SrcVT, LHS);
46256 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
46257 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46258 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
46259 Res = DAG.getBitcast(ShufVT, Res);
46260 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
46261 return DAG.getBitcast(VT, Res);
46262 }
46263 }
46264 }
46265
46266 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
46267 if (VT.is256BitVector() && Subtarget.hasInt256()) {
46268 SmallVector<int> Mask0, Mask1;
46269 SmallVector<SDValue> Ops0, Ops1;
46270 SmallVector<int, 2> ScaledMask0, ScaledMask1;
46271 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46272 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46273 !Ops0.empty() && !Ops1.empty() &&
46274 all_of(Ops0,
46275 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
46276 all_of(Ops1,
46277 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
46278 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46279 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
46280 SDValue Op00 = peekThroughBitcasts(Ops0.front());
46281 SDValue Op10 = peekThroughBitcasts(Ops1.front());
46282 SDValue Op01 = peekThroughBitcasts(Ops0.back());
46283 SDValue Op11 = peekThroughBitcasts(Ops1.back());
46284 if ((Op00 == Op11) && (Op01 == Op10)) {
46285 std::swap(Op10, Op11);
46286 ShuffleVectorSDNode::commuteMask(ScaledMask1);
46287 }
46288 if ((Op00 == Op10) && (Op01 == Op11)) {
46289 const int Map[4] = {0, 2, 1, 3};
46290 SmallVector<int, 4> ShuffleMask(
46291 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
46292 Map[ScaledMask1[1]]});
46293 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
46294 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
46295 DAG.getBitcast(SrcVT, Op01));
46296 Res = DAG.getBitcast(ShufVT, Res);
46297 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
46298 return DAG.getBitcast(VT, Res);
46299 }
46300 }
46301 }
46302
46303 return SDValue();
46304}
46305
46306static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
46307 TargetLowering::DAGCombinerInfo &DCI,
46308 const X86Subtarget &Subtarget) {
46309 unsigned Opcode = N->getOpcode();
46310 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
46311 "Unexpected pack opcode");
46312
46313 EVT VT = N->getValueType(0);
46314 SDValue N0 = N->getOperand(0);
46315 SDValue N1 = N->getOperand(1);
46316 unsigned NumDstElts = VT.getVectorNumElements();
46317 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
46318 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
46319 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
46320 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
46321 "Unexpected PACKSS/PACKUS input type");
46322
46323 bool IsSigned = (X86ISD::PACKSS == Opcode);
46324
46325 // Constant Folding.
46326 APInt UndefElts0, UndefElts1;
46327 SmallVector<APInt, 32> EltBits0, EltBits1;
46328 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
46329 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
46330 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
46331 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
46332 unsigned NumLanes = VT.getSizeInBits() / 128;
46333 unsigned NumSrcElts = NumDstElts / 2;
46334 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
46335 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
46336
46337 APInt Undefs(NumDstElts, 0);
46338 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
46339 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
46340 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
46341 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
46342 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
46343 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
46344
46345 if (UndefElts[SrcIdx]) {
46346 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
46347 continue;
46348 }
46349
46350 APInt &Val = EltBits[SrcIdx];
46351 if (IsSigned) {
46352 // PACKSS: Truncate signed value with signed saturation.
46353 // Source values less than dst minint are saturated to minint.
46354 // Source values greater than dst maxint are saturated to maxint.
46355 if (Val.isSignedIntN(DstBitsPerElt))
46356 Val = Val.trunc(DstBitsPerElt);
46357 else if (Val.isNegative())
46358 Val = APInt::getSignedMinValue(DstBitsPerElt);
46359 else
46360 Val = APInt::getSignedMaxValue(DstBitsPerElt);
46361 } else {
46362 // PACKUS: Truncate signed value with unsigned saturation.
46363 // Source values less than zero are saturated to zero.
46364 // Source values greater than dst maxuint are saturated to maxuint.
46365 if (Val.isIntN(DstBitsPerElt))
46366 Val = Val.trunc(DstBitsPerElt);
46367 else if (Val.isNegative())
46368 Val = APInt::getZero(DstBitsPerElt);
46369 else
46370 Val = APInt::getAllOnes(DstBitsPerElt);
46371 }
46372 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
46373 }
46374 }
46375
46376 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
46377 }
46378
46379 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
46380 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
46381 return V;
46382
46383 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
46384 // truncate to create a larger truncate.
46385 if (Subtarget.hasAVX512() &&
46386 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
46387 N0.getOperand(0).getValueType() == MVT::v8i32) {
46388 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
46389 (!IsSigned &&
46390 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
46391 if (Subtarget.hasVLX())
46392 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
46393
46394 // Widen input to v16i32 so we can truncate that.
46395 SDLoc dl(N);
46396 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
46397 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
46398 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
46399 }
46400 }
46401
46402 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
46403 if (VT.is128BitVector()) {
46404 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
46405 SDValue Src0, Src1;
46406 if (N0.getOpcode() == ExtOpc &&
46407 N0.getOperand(0).getValueType().is64BitVector() &&
46408 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
46409 Src0 = N0.getOperand(0);
46410 }
46411 if (N1.getOpcode() == ExtOpc &&
46412 N1.getOperand(0).getValueType().is64BitVector() &&
46413 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
46414 Src1 = N1.getOperand(0);
46415 }
46416 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
46417 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
46418 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
46419 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
46420 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
46421 }
46422
46423 // Try again with pack(*_extend_vector_inreg, undef).
46424 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
46425 : ISD::ZERO_EXTEND_VECTOR_INREG;
46426 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
46427 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
46428 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
46429 DAG);
46430 }
46431
46432 // Attempt to combine as shuffle.
46433 SDValue Op(N, 0);
46434 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46435 return Res;
46436
46437 return SDValue();
46438}
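
[Editor's note] The saturation rules spelled out in the constant-folding loop above can be modeled per element with ordinary scalar code. The sketch below shows the PACKSS/PACKUS element behaviour only; it is not the LLVM implementation, the helper names are invented, and it assumes C++17 for std::clamp.

    // pack_saturate_demo.cpp -- per-element model of PACKSS/PACKUS saturation.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Signed saturation of an i16 source element to i8 (PACKSSWB behaviour).
    static int8_t packss(int16_t v) {
      return static_cast<int8_t>(std::clamp<int16_t>(v, -128, 127));
    }
    // Unsigned saturation of an i16 source element to u8 (PACKUSWB behaviour).
    static uint8_t packus(int16_t v) {
      return static_cast<uint8_t>(std::clamp<int16_t>(v, 0, 255));
    }

    int main() {
      assert(packss(1000) == 127);    // above dst maxint  -> maxint
      assert(packss(-1000) == -128);  // below dst minint  -> minint
      assert(packss(42) == 42);       // in range          -> plain truncate
      assert(packus(-5) == 0);        // negative          -> zero
      assert(packus(300) == 255);     // above dst maxuint -> maxuint
      assert(packus(200) == 200);     // in range          -> plain truncate
      return 0;
    }
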
46439
46440static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
46441 TargetLowering::DAGCombinerInfo &DCI,
46442 const X86Subtarget &Subtarget) {
46443 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
46444 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
46445 "Unexpected horizontal add/sub opcode");
46446
46447 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
46448 MVT VT = N->getSimpleValueType(0);
46449 SDValue LHS = N->getOperand(0);
46450 SDValue RHS = N->getOperand(1);
46451
46452 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
46453 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
46454 LHS.getOpcode() == RHS.getOpcode() &&
46455 LHS.getValueType() == RHS.getValueType() &&
46456 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
46457 SDValue LHS0 = LHS.getOperand(0);
46458 SDValue LHS1 = LHS.getOperand(1);
46459 SDValue RHS0 = RHS.getOperand(0);
46460 SDValue RHS1 = RHS.getOperand(1);
46461 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
46462 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
46463 SDLoc DL(N);
46464 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
46465 LHS0.isUndef() ? LHS1 : LHS0,
46466 RHS0.isUndef() ? RHS1 : RHS0);
46467 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
46468 Res = DAG.getBitcast(ShufVT, Res);
46469 SDValue NewLHS =
46470 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
46471 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
46472 SDValue NewRHS =
46473 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
46474 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
46475 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
46476 DAG.getBitcast(VT, NewRHS));
46477 }
46478 }
46479 }
46480
46481 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
46482 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
46483 return V;
46484
46485 return SDValue();
46486}
46487
46488static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
46489 TargetLowering::DAGCombinerInfo &DCI,
46490 const X86Subtarget &Subtarget) {
46491 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
46492 X86ISD::VSRL == N->getOpcode()) &&
46493 "Unexpected shift opcode");
46494 EVT VT = N->getValueType(0);
46495 SDValue N0 = N->getOperand(0);
46496 SDValue N1 = N->getOperand(1);
46497
46498 // Shift zero -> zero.
46499 if (ISD::isBuildVectorAllZeros(N0.getNode()))
46500 return DAG.getConstant(0, SDLoc(N), VT);
46501
46502 // Detect constant shift amounts.
46503 APInt UndefElts;
46504 SmallVector<APInt, 32> EltBits;
46505 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
46506 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
46507 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
46508 EltBits[0].getZExtValue(), DAG);
46509 }
46510
46511 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46512 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
46513 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
46514 return SDValue(N, 0);
46515
46516 return SDValue();
46517}
46518
46519static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
46520 TargetLowering::DAGCombinerInfo &DCI,
46521 const X86Subtarget &Subtarget) {
46522 unsigned Opcode = N->getOpcode();
46523 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
46524 X86ISD::VSRLI == Opcode) &&
46525 "Unexpected shift opcode");
46526 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
46527 EVT VT = N->getValueType(0);
46528 SDValue N0 = N->getOperand(0);
46529 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46530 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
46531 "Unexpected value type");
46532 assert(N->getOperand(1).getValueType() == MVT::i8 &&
46533 "Unexpected shift amount type");
46534
46535 // (shift undef, X) -> 0
46536 if (N0.isUndef())
46537 return DAG.getConstant(0, SDLoc(N), VT);
46538
46539 // Out of range logical bit shifts are guaranteed to be zero.
46540 // Out of range arithmetic bit shifts splat the sign bit.
46541 unsigned ShiftVal = N->getConstantOperandVal(1);
46542 if (ShiftVal >= NumBitsPerElt) {
46543 if (LogicalShift)
46544 return DAG.getConstant(0, SDLoc(N), VT);
46545 ShiftVal = NumBitsPerElt - 1;
46546 }
46547
46548 // (shift X, 0) -> X
46549 if (!ShiftVal)
46550 return N0;
46551
46552 // (shift 0, C) -> 0
46553 if (ISD::isBuildVectorAllZeros(N0.getNode()))
46554 // N0 is all zeros or undef. We guarantee that the bits shifted into the
46555 // result are all zeros, not undef.
46556 return DAG.getConstant(0, SDLoc(N), VT);
46557
46558 // (VSRAI -1, C) -> -1
46559 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
46560 // N0 is all ones or undef. We guarantee that the bits shifted into the
46561 // result are all ones, not undef.
46562 return DAG.getConstant(-1, SDLoc(N), VT);
46563
46564 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
46565 if (Opcode == N0.getOpcode()) {
46566 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
46567 unsigned NewShiftVal = ShiftVal + ShiftVal2;
46568 if (NewShiftVal >= NumBitsPerElt) {
46569 // Out of range logical bit shifts are guaranteed to be zero.
46570 // Out of range arithmetic bit shifts splat the sign bit.
46571 if (LogicalShift)
46572 return DAG.getConstant(0, SDLoc(N), VT);
46573 NewShiftVal = NumBitsPerElt - 1;
46574 }
46575 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
46576 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
46577 }
46578
46579 // We can decode 'whole byte' logical bit shifts as shuffles.
46580 if (LogicalShift && (ShiftVal % 8) == 0) {
46581 SDValue Op(N, 0);
46582 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46583 return Res;
46584 }
46585
46586 // Constant Folding.
46587 APInt UndefElts;
46588 SmallVector<APInt, 32> EltBits;
46589 if (N->isOnlyUserOf(N0.getNode()) &&
46590 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
46591 assert(EltBits.size() == VT.getVectorNumElements() &&
46592 "Unexpected shift value type");
46593 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
46594 // created an undef input due to no input bits being demanded, but user
46595 // still expects 0 in other bits.
46596 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
46597 APInt &Elt = EltBits[i];
46598 if (UndefElts[i])
46599 Elt = 0;
46600 else if (X86ISD::VSHLI == Opcode)
46601 Elt <<= ShiftVal;
46602 else if (X86ISD::VSRAI == Opcode)
46603 Elt.ashrInPlace(ShiftVal);
46604 else
46605 Elt.lshrInPlace(ShiftVal);
46606 }
46607 // Reset undef elements since they were zeroed above.
46608 UndefElts = 0;
46609 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
46610 }
46611
46612 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46613 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
46614 DCI))
46615 return SDValue(N, 0);
46616
46617 return SDValue();
46618}
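
[Editor's note] A scalar view of the immediate-shift folds above (merging shift amounts, and the out-of-range behaviour the comments describe). Illustrative only: the X86 shift nodes define out-of-range shifts where C++ does not, and signed right shift is assumed to be arithmetic as on x86 compilers.

    // vshift_imm_demo.cpp -- scalar view of the immediate-shift folds.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0x80001234;
      // (shift (shift X, C2), C1) -> (shift X, C1 + C2) while the sum is in range.
      assert(((x >> 3) >> 4) == (x >> 7));
      // For the X86 nodes, out-of-range logical shifts are defined to give 0 and
      // out-of-range arithmetic shifts splat the sign bit; that is why the code
      // clamps to NumBitsPerElt - 1 instead of emitting the over-wide shift.
      int32_t s = static_cast<int32_t>(x);   // negative value
      assert((s >> 31) == -1);               // sign splat == shift by width - 1
      return 0;
    }
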
46619
46620static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
46621 TargetLowering::DAGCombinerInfo &DCI,
46622 const X86Subtarget &Subtarget) {
46623 EVT VT = N->getValueType(0);
46624 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
46625 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
46626 N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
46627 "Unexpected vector insertion");
46628
46629 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
46630 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46631 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46632 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
46633 APInt::getAllOnes(NumBitsPerElt), DCI))
46634 return SDValue(N, 0);
46635 }
46636
46637 // Attempt to combine insertion patterns to a shuffle.
46638 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
46639 SDValue Op(N, 0);
46640 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46641 return Res;
46642 }
46643
46644 return SDValue();
46645}
46646
46647/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
46648/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
46649/// OR -> CMPNEQSS.
46650static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
46651 TargetLowering::DAGCombinerInfo &DCI,
46652 const X86Subtarget &Subtarget) {
46653 unsigned opcode;
46654
46655 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
46656 // we're requiring SSE2 for both.
46657 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
46658 SDValue N0 = N->getOperand(0);
46659 SDValue N1 = N->getOperand(1);
46660 SDValue CMP0 = N0.getOperand(1);
46661 SDValue CMP1 = N1.getOperand(1);
46662 SDLoc DL(N);
46663
46664 // The SETCCs should both refer to the same CMP.
46665 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
46666 return SDValue();
46667
46668 SDValue CMP00 = CMP0->getOperand(0);
46669 SDValue CMP01 = CMP0->getOperand(1);
46670 EVT VT = CMP00.getValueType();
46671
46672 if (VT == MVT::f32 || VT == MVT::f64 ||
46673 (VT == MVT::f16 && Subtarget.hasFP16())) {
46674 bool ExpectingFlags = false;
46675 // Check for any users that want flags:
46676 for (const SDNode *U : N->uses()) {
46677 if (ExpectingFlags)
46678 break;
46679
46680 switch (U->getOpcode()) {
46681 default:
46682 case ISD::BR_CC:
46683 case ISD::BRCOND:
46684 case ISD::SELECT:
46685 ExpectingFlags = true;
46686 break;
46687 case ISD::CopyToReg:
46688 case ISD::SIGN_EXTEND:
46689 case ISD::ZERO_EXTEND:
46690 case ISD::ANY_EXTEND:
46691 break;
46692 }
46693 }
46694
46695 if (!ExpectingFlags) {
46696 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
46697 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
46698
46699 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
46700 X86::CondCode tmp = cc0;
46701 cc0 = cc1;
46702 cc1 = tmp;
46703 }
46704
46705 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
46706 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
46707 // FIXME: need symbolic constants for these magic numbers.
46708 // See X86ATTInstPrinter.cpp:printSSECC().
46709 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
46710 if (Subtarget.hasAVX512()) {
46711 SDValue FSetCC =
46712 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
46713 DAG.getTargetConstant(x86cc, DL, MVT::i8));
46714 // Need to fill with zeros to ensure the bitcast will produce zeroes
46715 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
46716 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
46717 DAG.getConstant(0, DL, MVT::v16i1),
46718 FSetCC, DAG.getIntPtrConstant(0, DL));
46719 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
46720 N->getSimpleValueType(0));
46721 }
46722 SDValue OnesOrZeroesF =
46723 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
46724 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
46725
46726 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
46727 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
46728
46729 if (is64BitFP && !Subtarget.is64Bit()) {
46730 // On a 32-bit target, we cannot bitcast the 64-bit float to a
46731 // 64-bit integer, since that's not a legal type. Since
46732 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
46733 // bits, but can do this little dance to extract the lowest 32 bits
46734 // and work with those going forward.
46735 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
46736 OnesOrZeroesF);
46737 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
46738 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
46739 Vector32, DAG.getIntPtrConstant(0, DL));
46740 IntVT = MVT::i32;
46741 }
46742
46743 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
46744 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
46745 DAG.getConstant(1, DL, IntVT));
46746 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
46747 ANDed);
46748 return OneBitOfTruth;
46749 }
46750 }
46751 }
46752 }
46753 return SDValue();
46754}
46755
46756/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
46757static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
46758 assert(N->getOpcode() == ISD::AND);
46759
46760 MVT VT = N->getSimpleValueType(0);
46761 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
46762 return SDValue();
46763
46764 SDValue X, Y;
46765 SDValue N0 = N->getOperand(0);
46766 SDValue N1 = N->getOperand(1);
46767
46768 auto GetNot = [&VT, &DAG](SDValue V) {
46769 // Basic X = NOT(Y) detection.
46770 if (SDValue Not = IsNOT(V, DAG))
46771 return Not;
46772 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
46773 if (V.getOpcode() == X86ISD::VBROADCAST) {
46774 SDValue Src = V.getOperand(0);
46775 EVT SrcVT = Src.getValueType();
46776 if (!SrcVT.isVector())
46777 return SDValue();
46778 if (SDValue Not = IsNOT(Src, DAG))
46779 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
46780 DAG.getBitcast(SrcVT, Not));
46781 }
46782 return SDValue();
46783 };
46784
46785 if (SDValue Not = GetNot(N0)) {
46786 X = Not;
46787 Y = N1;
46788 } else if (SDValue Not = GetNot(N1)) {
46789 X = Not;
46790 Y = N0;
46791 } else
46792 return SDValue();
46793
46794 X = DAG.getBitcast(VT, X);
46795 Y = DAG.getBitcast(VT, Y);
46796 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
46797}
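
[Editor's note] The ANDNP fold rests on the identity (and (xor X, -1), Y) == (~X & Y), which is what the x86 PANDN/ANDNP instructions compute per lane. A one-line scalar check, purely illustrative:

    // andnp_demo.cpp -- the identity behind the ANDNP fold.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0xF0F0F0F0u, Y = 0x12345678u;
      // (and (xor X, -1), Y) is exactly ~X & Y.
      assert(((X ^ 0xFFFFFFFFu) & Y) == (~X & Y));
      return 0;
    }
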
46798
46799// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
46800// logical operations, like in the example below.
46801// or (and (truncate x, truncate y)),
46802// (xor (truncate z, build_vector (constants)))
46803// Given a target type \p VT, we generate
46804// or (and x, y), (xor z, zext(build_vector (constants)))
46805 // given that x, y and z are of type \p VT. We can do so if each operand is
46806 // either a truncate from VT, a vector of constants, or can itself be
46807 // recursively promoted.
46808static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
46809 unsigned Depth) {
46810 // Limit recursion to avoid excessive compile times.
46811 if (Depth >= SelectionDAG::MaxRecursionDepth)
46812 return SDValue();
46813
46814 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
46815 N->getOpcode() != ISD::OR)
46816 return SDValue();
46817
46818 SDValue N0 = N->getOperand(0);
46819 SDValue N1 = N->getOperand(1);
46820 SDLoc DL(N);
46821
46822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46823 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
46824 return SDValue();
46825
46826 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
46827 N0 = NN0;
46828 else {
46829 // The Left side has to be a trunc.
46830 if (N0.getOpcode() != ISD::TRUNCATE)
46831 return SDValue();
46832
46833 // The type of the truncated inputs.
46834 if (N0.getOperand(0).getValueType() != VT)
46835 return SDValue();
46836
46837 N0 = N0.getOperand(0);
46838 }
46839
46840 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
46841 N1 = NN1;
46842 else {
46843 // The right side has to be a 'trunc' or a constant vector.
46844 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
46845 N1.getOperand(0).getValueType() == VT;
46846 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
46847 return SDValue();
46848
46849 if (RHSTrunc)
46850 N1 = N1.getOperand(0);
46851 else
46852 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
46853 }
46854
46855 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
46856}
46857
46858// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
46859// register. In most cases we actually compare or select YMM-sized registers
46860// and mixing the two types creates horrible code. This method optimizes
46861// some of the transition sequences.
46862// Even with AVX-512 this is still useful for removing casts around logical
46863// operations on vXi1 mask types.
46864static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
46865 const X86Subtarget &Subtarget) {
46866 EVT VT = N->getValueType(0);
46867 assert(VT.isVector() && "Expected vector type");
46868
46869 SDLoc DL(N);
46870 assert((N->getOpcode() == ISD::ANY_EXTEND ||
46871 N->getOpcode() == ISD::ZERO_EXTEND ||
46872 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
46873
46874 SDValue Narrow = N->getOperand(0);
46875 EVT NarrowVT = Narrow.getValueType();
46876
46877 // Generate the wide operation.
46878 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
46879 if (!Op)
46880 return SDValue();
46881 switch (N->getOpcode()) {
46882 default: llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 46882)
;
46883 case ISD::ANY_EXTEND:
46884 return Op;
46885 case ISD::ZERO_EXTEND:
46886 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
46887 case ISD::SIGN_EXTEND:
46888 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
46889 Op, DAG.getValueType(NarrowVT));
46890 }
46891}
46892
46893static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
46894 unsigned FPOpcode;
46895 switch (Opcode) {
46896 default: llvm_unreachable("Unexpected input node for FP logic conversion")::llvm::llvm_unreachable_internal("Unexpected input node for FP logic conversion"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 46896)
;
46897 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46898 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46899 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46900 }
46901 return FPOpcode;
46902}
46903
46904/// If both input operands of a logic op are being cast from floating-point
46905/// types or FP compares, try to convert this into a floating-point logic node
46906/// to avoid unnecessary moves from SSE to integer registers.
46907static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
46908 TargetLowering::DAGCombinerInfo &DCI,
46909 const X86Subtarget &Subtarget) {
46910 EVT VT = N->getValueType(0);
46911 SDValue N0 = N->getOperand(0);
46912 SDValue N1 = N->getOperand(1);
46913 SDLoc DL(N);
46914
46915 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
46916 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
46917 return SDValue();
46918
46919 SDValue N00 = N0.getOperand(0);
46920 SDValue N10 = N1.getOperand(0);
46921 EVT N00Type = N00.getValueType();
46922 EVT N10Type = N10.getValueType();
46923
46924 // Ensure that both types are the same and are legal scalar fp types.
46925 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
46926 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
46927 (Subtarget.hasFP16() && N00Type == MVT::f16)))
46928 return SDValue();
46929
46930 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
46931 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
46932 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
46933 return DAG.getBitcast(VT, FPLogic);
46934 }
46935
46936 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
46937 !N1.hasOneUse())
46938 return SDValue();
46939
46940 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
46941 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
46942
46943 // The vector ISA for FP predicates is incomplete before AVX, so converting
46944 // COMIS* to CMPS* may not be a win before AVX.
46945 if (!Subtarget.hasAVX() &&
46946 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
46947 return SDValue();
46948
46949 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
46950 // and vector logic:
46951 // logic (setcc N00, N01), (setcc N10, N11) -->
46952 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
46953 unsigned NumElts = 128 / N00Type.getSizeInBits();
46954 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
46955 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46956 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
46957 SDValue N01 = N0.getOperand(1);
46958 SDValue N11 = N1.getOperand(1);
46959 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
46960 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
46961 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
46962 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
46963 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
46964 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
46965 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
46966 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
46967}
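As an aside (not part of this file): the SETCC path above typically arises from source that combines two scalar FP comparisons with a bitwise '&' or '|'. A minimal hedged C++ sketch of such a trigger, with an illustrative function name:

  // Illustrative only. Both comparisons feed a single integer AND, so the
  // combine above can rewrite COMIS*+SETCC pairs into a vector CMPS* plus FP
  // logic and an extract of element 0, avoiding SSE->GPR moves.
  static bool bothLess(float a, float b, float c, float d) {
    return (a < b) & (c < d); // bitwise '&' keeps two i1 setcc nodes live
  }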
46968
46969// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
46970// to reduce XMM->GPR traffic.
46971static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
46972 unsigned Opc = N->getOpcode();
46973 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
46974 "Unexpected bit opcode");
46975
46976 SDValue N0 = N->getOperand(0);
46977 SDValue N1 = N->getOperand(1);
46978
46979 // Both operands must be single use MOVMSK.
46980 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
46981 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
46982 return SDValue();
46983
46984 SDValue Vec0 = N0.getOperand(0);
46985 SDValue Vec1 = N1.getOperand(0);
46986 EVT VecVT0 = Vec0.getValueType();
46987 EVT VecVT1 = Vec1.getValueType();
46988
46989 // Both MOVMSK operands must be from vectors of the same size and same element
46990 // size, but it's OK for them to differ in fp/int type.
46991 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
46992 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
46993 return SDValue();
46994
46995 SDLoc DL(N);
46996 unsigned VecOpc =
46997 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
46998 SDValue Result =
46999 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
47000 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47001}
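The MOVMSK fold relies on sign-bit extraction distributing over bitwise ops: each lane's sign bit of BITOP(X,Y) equals the BITOP of the per-lane sign bits. A hedged SSE-intrinsics sketch of the two equivalent forms (assumes <immintrin.h>; not part of this file):

  #include <immintrin.h>
  // Two XMM->GPR transfers (MOVMSKPS twice), then a scalar OR.
  int orOfMasks(__m128 x, __m128 y) {
    return _mm_movemask_ps(x) | _mm_movemask_ps(y);
  }
  // Same 4-bit result with a single MOVMSKPS; this is the form the fold keeps.
  int maskOfOr(__m128 x, __m128 y) {
    return _mm_movemask_ps(_mm_or_ps(x, y));
  }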
47002
47003// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
47004// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
47005// handles in InstCombine.
47006static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
47007 unsigned Opc = N->getOpcode();
47008 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47009 "Unexpected bit opcode");
47010
47011 SDValue N0 = N->getOperand(0);
47012 SDValue N1 = N->getOperand(1);
47013 EVT VT = N->getValueType(0);
47014
47015 // Both operands must be single use.
47016 if (!N0.hasOneUse() || !N1.hasOneUse())
47017 return SDValue();
47018
47019 // Search for matching shifts.
47020 SDValue BC0 = peekThroughOneUseBitcasts(N0);
47021 SDValue BC1 = peekThroughOneUseBitcasts(N1);
47022
47023 unsigned BCOpc = BC0.getOpcode();
47024 EVT BCVT = BC0.getValueType();
47025 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
47026 return SDValue();
47027
47028 switch (BCOpc) {
47029 case X86ISD::VSHLI:
47030 case X86ISD::VSRLI:
47031 case X86ISD::VSRAI: {
47032 if (BC0.getOperand(1) != BC1.getOperand(1))
47033 return SDValue();
47034
47035 SDLoc DL(N);
47036 SDValue BitOp =
47037 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
47038 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
47039 return DAG.getBitcast(VT, Shift);
47040 }
47041 }
47042
47043 return SDValue();
47044}
47045
47046/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
47047/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
47048/// with a shift-right to eliminate loading the vector constant mask value.
47049static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
47050 const X86Subtarget &Subtarget) {
47051 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
47052 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
47053 EVT VT = Op0.getValueType();
47054 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
47055 return SDValue();
47056
47057 // Try to convert an "is positive" signbit masking operation into arithmetic
47058 // shift and "andn". This saves a materialization of a -1 vector constant.
47059 // The "is negative" variant should be handled more generally because it only
47060 // requires "and" rather than "andn":
47061 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
47062 //
47063 // This is limited to the original type to avoid producing even more bitcasts.
47064 // If the bitcasts can't be eliminated, then it is unlikely that this fold
47065 // will be profitable.
47066 if (N->getValueType(0) == VT &&
47067 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
47068 SDValue X, Y;
47069 if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
47070 isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
47071 X = Op1.getOperand(0);
47072 Y = Op0;
47073 } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
47074 isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
47075 X = Op0.getOperand(0);
47076 Y = Op1;
47077 }
47078 if (X && Y) {
47079 SDLoc DL(N);
47080 SDValue Sra =
47081 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
47082 VT.getScalarSizeInBits() - 1, DAG);
47083 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
47084 }
47085 }
47086
47087 APInt SplatVal;
47088 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
47089 !SplatVal.isMask())
47090 return SDValue();
47091
47092 // Don't prevent creation of ANDN.
47093 if (isBitwiseNot(Op0))
47094 return SDValue();
47095
47096 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
47097 return SDValue();
47098
47099 unsigned EltBitWidth = VT.getScalarSizeInBits();
47100 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
47101 return SDValue();
47102
47103 SDLoc DL(N);
47104 unsigned ShiftVal = SplatVal.countTrailingOnes();
47105 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
47106 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
47107 return DAG.getBitcast(N->getValueType(0), Shift);
47108}
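The final mask-to-shift rewrite uses a scalar identity: when every element of Op0 is known to be all-zeros or all-ones, AND-ing with a low-bits mask of k set bits equals a logical shift right by (EltBitWidth - k). A small scalar sketch, assuming 0 < k < 32 so the shift amount stays in range (illustrative, not part of this file):

  #include <cstdint>
  // x must be 0 or 0xFFFFFFFF (e.g. a sign-extended compare result).
  uint32_t andWithLowMask(uint32_t x, unsigned k) {
    return x & ((1u << k) - 1u); // needs the mask constant
  }
  uint32_t shiftInstead(uint32_t x, unsigned k) {
    return x >> (32u - k);       // same value, no constant load
  }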
47109
47110// Get the index node from the lowered DAG of a GEP IR instruction with one
47111// indexing dimension.
47112static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
47113 if (Ld->isIndexed())
47114 return SDValue();
47115
47116 SDValue Base = Ld->getBasePtr();
47117
47118 if (Base.getOpcode() != ISD::ADD)
47119 return SDValue();
47120
47121 SDValue ShiftedIndex = Base.getOperand(0);
47122
47123 if (ShiftedIndex.getOpcode() != ISD::SHL)
47124 return SDValue();
47125
47126 return ShiftedIndex.getOperand(0);
47127
47128}
47129
47130static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
47131 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
47132 switch (VT.getSizeInBits()) {
47133 default: return false;
47134 case 64: return Subtarget.is64Bit();
47135 case 32: return true;
47136 }
47137 }
47138 return false;
47139}
47140
47141// This function recognizes cases where the X86 bzhi instruction can replace an
47142// 'and-load' sequence.
47143// In the case of loading an integer value from an array of constants which is
47144// defined as follows:
47145//
47146// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
47147//
47148// and then applying a bitwise 'and' of the result with another input,
47149// this is equivalent to performing bzhi (zero high bits) on the input, with the
47150// same index as the load.
47151static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
47152 const X86Subtarget &Subtarget) {
47153 MVT VT = Node->getSimpleValueType(0);
47154 SDLoc dl(Node);
47155
47156 // Check if subtarget has BZHI instruction for the node's type
47157 if (!hasBZHI(Subtarget, VT))
47158 return SDValue();
47159
47160 // Try matching the pattern for both operands.
47161 for (unsigned i = 0; i < 2; i++) {
47162 SDValue N = Node->getOperand(i);
47163 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
47164
47165 // Bail out if the operand is not a load instruction.
47166 if (!Ld)
47167 return SDValue();
47168
47169 const Value *MemOp = Ld->getMemOperand()->getValue();
47170
47171 if (!MemOp)
47172 return SDValue();
47173
47174 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
47175 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
47176 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
47177
47178 Constant *Init = GV->getInitializer();
47179 Type *Ty = Init->getType();
47180 if (!isa<ConstantDataArray>(Init) ||
47181 !Ty->getArrayElementType()->isIntegerTy() ||
47182 Ty->getArrayElementType()->getScalarSizeInBits() !=
47183 VT.getSizeInBits() ||
47184 Ty->getArrayNumElements() >
47185 Ty->getArrayElementType()->getScalarSizeInBits())
47186 continue;
47187
47188 // Check if the array's constant elements are suitable to our case.
47189 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
47190 bool ConstantsMatch = true;
47191 for (uint64_t j = 0; j < ArrayElementCount; j++) {
47192 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
47193 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
47194 ConstantsMatch = false;
47195 break;
47196 }
47197 }
47198 if (!ConstantsMatch)
47199 continue;
47200
47201 // Do the transformation (For 32-bit type):
47202 // -> (and (load arr[idx]), inp)
47203 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
47204 // that will be replaced with one bzhi instruction.
47205 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
47206 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
47207
47208 // Get the Node which indexes into the array.
47209 SDValue Index = getIndexFromUnindexedLoad(Ld);
47210 if (!Index)
47211 return SDValue();
47212 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
47213
47214 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
47215 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
47216
47217 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
47218 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
47219
47220 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
47221 }
47222 }
47223 }
47224 }
47225 return SDValue();
47226}
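Concretely, the table the combine matches holds (1 << i) - 1 at index i, so the and-load just clears the bits from position idx upward, which is what BZHI does in one instruction. A hedged C++ sketch of the source-level pattern (array name and size are illustrative; the fold additionally requires the table to be a constant global with a definitive initializer, as checked above):

  #include <cstdint>
  // mask_table[i] == (1u << i) - 1
  static const uint32_t mask_table[8] = {0x0, 0x1, 0x3, 0x7,
                                         0xF, 0x1F, 0x3F, 0x7F};
  uint32_t keepLowBits(uint32_t x, unsigned idx) {
    return x & mask_table[idx]; // idx < 8; with BMI2 this can become BZHI
  }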
47227
47228// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
47229// where C is a mask containing the same number of bits as the setcc and
47230// where the setcc will freely zero the upper bits of the k-register. We can
47231// replace the undef in the concat with 0s and remove the AND. This mainly
47232// helps with v2i1/v4i1 setcc being cast to scalar.
47233static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
47234 const X86Subtarget &Subtarget) {
47235 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
47236
47237 EVT VT = N->getValueType(0);
47238
47239 // Make sure this is an AND with constant. We will check the value of the
47240 // constant later.
47241 if (!isa<ConstantSDNode>(N->getOperand(1)))
47242 return SDValue();
47243
47244 // This is implied by the ConstantSDNode.
47245 assert(!VT.isVector() && "Expected scalar VT!");
47246
47247 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
47248 !N->getOperand(0).hasOneUse() ||
47249 !N->getOperand(0).getOperand(0).hasOneUse())
47250 return SDValue();
47251
47252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47253 SDValue Src = N->getOperand(0).getOperand(0);
47254 EVT SrcVT = Src.getValueType();
47255 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
47256 !TLI.isTypeLegal(SrcVT))
47257 return SDValue();
47258
47259 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
47260 return SDValue();
47261
47262 // We only care about the first subvector of the concat, we expect the
47263 // other subvectors to be ignored due to the AND if we make the change.
47264 SDValue SubVec = Src.getOperand(0);
47265 EVT SubVecVT = SubVec.getValueType();
47266
47267 // First subvector should be a setcc with a legal result type. The RHS of the
47268 // AND should be a mask with this many bits.
47269 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
47270 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
47271 return SDValue();
47272
47273 EVT SetccVT = SubVec.getOperand(0).getValueType();
47274 if (!TLI.isTypeLegal(SetccVT) ||
47275 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
47276 return SDValue();
47277
47278 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
47279 return SDValue();
47280
47281 // We passed all the checks. Rebuild the concat_vectors with zeroes
47282 // and cast it back to VT.
47283 SDLoc dl(N);
47284 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
47285 DAG.getConstant(0, dl, SubVecVT));
47286 Ops[0] = SubVec;
47287 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
47288 Ops);
47289 return DAG.getBitcast(VT, Concat);
47290}
47291
47292static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
47293 TargetLowering::DAGCombinerInfo &DCI,
47294 const X86Subtarget &Subtarget) {
47295 SDValue N0 = N->getOperand(0);
47296 SDValue N1 = N->getOperand(1);
47297 EVT VT = N->getValueType(0);
47298 SDLoc dl(N);
47299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47300
47301 // If this is SSE1 only convert to FAND to avoid scalarization.
47302 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47303 return DAG.getBitcast(MVT::v4i32,
47304 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
47305 DAG.getBitcast(MVT::v4f32, N0),
47306 DAG.getBitcast(MVT::v4f32, N1)));
47307 }
47308
47309 // Use a 32-bit and+zext if upper bits known zero.
47310 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
47311 APInt HiMask = APInt::getHighBitsSet(64, 32);
47312 if (DAG.MaskedValueIsZero(N1, HiMask) ||
47313 DAG.MaskedValueIsZero(N0, HiMask)) {
47314 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
47315 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
47316 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
47317 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
47318 }
47319 }
47320
47321 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
47322 // TODO: Support multiple SrcOps.
47323 if (VT == MVT::i1) {
47324 SmallVector<SDValue, 2> SrcOps;
47325 SmallVector<APInt, 2> SrcPartials;
47326 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
47327 SrcOps.size() == 1) {
47328 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
47329 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47330 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
47331 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
47332 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
47333 if (Mask) {
47334 assert(SrcPartials[0].getBitWidth() == NumElts &&
47335 "Unexpected partial reduction mask");
47336 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
47337 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
47338 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
47339 }
47340 }
47341 }
47342
47343 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
47344 return V;
47345
47346 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47347 return R;
47348
47349 if (SDValue R = combineBitOpWithShift(N, DAG))
47350 return R;
47351
47352 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
47353 return FPLogic;
47354
47355 if (DCI.isBeforeLegalizeOps())
47356 return SDValue();
47357
47358 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
47359 return R;
47360
47361 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
47362 return R;
47363
47364 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
47365 return ShiftRight;
47366
47367 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
47368 return R;
47369
47370 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
47371 // This avoids a slow variable shift (moving the shift amount to ECX etc.).
47372 if (isOneConstant(N1) && N0->hasOneUse()) {
47373 SDValue Src = N0;
47374 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
47375 Src.getOpcode() == ISD::TRUNCATE) &&
47376 Src.getOperand(0)->hasOneUse())
47377 Src = Src.getOperand(0);
47378 X86::CondCode X86CC = X86::COND_B;
47379 // Peek through AND(NOT(SRL(X,Y)),1).
47380 if (isBitwiseNot(Src)) {
47381 Src = Src.getOperand(0);
47382 X86CC = X86::COND_AE;
47383 }
47384 if (Src.getOpcode() == ISD::SRL &&
47385 !isa<ConstantSDNode>(Src.getOperand(1))) {
47386 SDValue BitNo = Src.getOperand(1);
47387 Src = Src.getOperand(0);
47388 // Peek through AND(SRL(NOT(X),Y),1).
47389 if (isBitwiseNot(Src)) {
47390 Src = Src.getOperand(0);
47391 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
47392 }
47393 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
47394 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
47395 }
47396 }
47397
47398 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47399 // Attempt to recursively combine a bitmask AND with shuffles.
47400 SDValue Op(N, 0);
47401 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47402 return Res;
47403
47404 // If either operand is a constant mask, then only the elements that aren't
47405 // zero are actually demanded by the other operand.
47406 auto GetDemandedMasks = [&](SDValue Op) {
47407 APInt UndefElts;
47408 SmallVector<APInt> EltBits;
47409 int NumElts = VT.getVectorNumElements();
47410 int EltSizeInBits = VT.getScalarSizeInBits();
47411 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
47412 APInt DemandedElts = APInt::getAllOnes(NumElts);
47413 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
47414 EltBits)) {
47415 DemandedBits.clearAllBits();
47416 DemandedElts.clearAllBits();
47417 for (int I = 0; I != NumElts; ++I)
47418 if (!EltBits[I].isZero()) {
47419 DemandedBits |= EltBits[I];
47420 DemandedElts.setBit(I);
47421 }
47422 }
47423 return std::make_pair(DemandedBits, DemandedElts);
47424 };
47425 std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
47426 std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0);
47427
47428 if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
47429 TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
47430 TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
47431 TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
47432 if (N->getOpcode() != ISD::DELETED_NODE)
47433 DCI.AddToWorklist(N);
47434 return SDValue(N, 0);
47435 }
47436
47437 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Demand0.first,
47438 Demand0.second, DAG);
47439 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Demand1.first,
47440 Demand1.second, DAG);
47441 if (NewN0 || NewN1)
47442 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
47443 NewN1 ? NewN1 : N1);
47444 }
47445
47446 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
47447 if ((VT.getScalarSizeInBits() % 8) == 0 &&
47448 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47449 isa<ConstantSDNode>(N0.getOperand(1))) {
47450 SDValue BitMask = N1;
47451 SDValue SrcVec = N0.getOperand(0);
47452 EVT SrcVecVT = SrcVec.getValueType();
47453
47454 // Check that the constant bitmask masks whole bytes.
47455 APInt UndefElts;
47456 SmallVector<APInt, 64> EltBits;
47457 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
47458 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
47459 llvm::all_of(EltBits, [](const APInt &M) {
47460 return M.isZero() || M.isAllOnes();
47461 })) {
47462 unsigned NumElts = SrcVecVT.getVectorNumElements();
47463 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
47464 unsigned Idx = N0.getConstantOperandVal(1);
47465
47466 // Create a root shuffle mask from the byte mask and the extracted index.
47467 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
47468 for (unsigned i = 0; i != Scale; ++i) {
47469 if (UndefElts[i])
47470 continue;
47471 int VecIdx = Scale * Idx + i;
47472 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
47473 }
47474
47475 if (SDValue Shuffle = combineX86ShufflesRecursively(
47476 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
47477 X86::MaxShuffleCombineDepth,
47478 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
47479 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
47480 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
47481 N0.getOperand(1));
47482 }
47483 }
47484
47485 return SDValue();
47486}
47487
47488// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
47489static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
47490 const X86Subtarget &Subtarget) {
47491 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
47492
47493 MVT VT = N->getSimpleValueType(0);
47494 unsigned EltSizeInBits = VT.getScalarSizeInBits();
47495 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
47496 return SDValue();
47497
47498 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
47499 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
47500 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
47501 return SDValue();
47502
47503 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
47504 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
47505 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
47506 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
47507 return SDValue();
47508
47509 // Attempt to extract constant byte masks.
47510 APInt UndefElts0, UndefElts1;
47511 SmallVector<APInt, 32> EltBits0, EltBits1;
47512 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
47513 false, false))
47514 return SDValue();
47515 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
47516 false, false))
47517 return SDValue();
47518
47519 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
47520 // TODO - add UNDEF elts support.
47521 if (UndefElts0[i] || UndefElts1[i])
47522 return SDValue();
47523 if (EltBits0[i] != ~EltBits1[i])
47524 return SDValue();
47525 }
47526
47527 SDLoc DL(N);
47528
47529 if (useVPTERNLOG(Subtarget, VT)) {
47530 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
47531 // VPTERNLOG is only available as vXi32/64-bit types.
47532 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
47533 MVT OpVT =
47534 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
47535 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
47536 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
47537 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
47538 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
47539 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
47540 DAG, Subtarget);
47541 return DAG.getBitcast(VT, Res);
47542 }
47543
47544 SDValue X = N->getOperand(0);
47545 SDValue Y =
47546 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
47547 DAG.getBitcast(VT, N1.getOperand(0)));
47548 return DAG.getNode(ISD::OR, DL, VT, X, Y);
47549}
47550
47551// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
47552static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
47553 if (N->getOpcode() != ISD::OR)
47554 return false;
47555
47556 SDValue N0 = N->getOperand(0);
47557 SDValue N1 = N->getOperand(1);
47558
47559 // Canonicalize AND to LHS.
47560 if (N1.getOpcode() == ISD::AND)
47561 std::swap(N0, N1);
47562
47563 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
47564 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
47565 return false;
47566
47567 Mask = N1.getOperand(0);
47568 X = N1.getOperand(1);
47569
47570 // Check to see if the mask appeared in both the AND and ANDNP.
47571 if (N0.getOperand(0) == Mask)
47572 Y = N0.getOperand(1);
47573 else if (N0.getOperand(1) == Mask)
47574 Y = N0.getOperand(0);
47575 else
47576 return false;
47577
47578 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
47579 // ANDNP combine allows other combines to happen that prevent matching.
47580 return true;
47581}
47582
47583// Try to fold:
47584// (or (and (m, y), (pandn m, x)))
47585// into:
47586// (vselect m, x, y)
47587// As a special case, try to fold:
47588// (or (and (m, (sub 0, x)), (pandn m, x)))
47589// into:
47590// (sub (xor X, M), M)
47591static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
47592 const X86Subtarget &Subtarget) {
47593 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
47594
47595 EVT VT = N->getValueType(0);
47596 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
47597 (VT.is256BitVector() && Subtarget.hasInt256())))
47598 return SDValue();
47599
47600 SDValue X, Y, Mask;
47601 if (!matchLogicBlend(N, X, Y, Mask))
47602 return SDValue();
47603
47604 // Validate that X, Y, and Mask are bitcasts, and see through them.
47605 Mask = peekThroughBitcasts(Mask);
47606 X = peekThroughBitcasts(X);
47607 Y = peekThroughBitcasts(Y);
47608
47609 EVT MaskVT = Mask.getValueType();
47610 unsigned EltBits = MaskVT.getScalarSizeInBits();
47611
47612 // TODO: Attempt to handle floating point cases as well?
47613 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
47614 return SDValue();
47615
47616 SDLoc DL(N);
47617
47618 // Attempt to combine to conditional negate: (sub (xor X, M), M)
47619 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
47620 DAG, Subtarget))
47621 return Res;
47622
47623 // PBLENDVB is only available on SSE 4.1.
47624 if (!Subtarget.hasSSE41())
47625 return SDValue();
47626
47627 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
47628 if (Subtarget.hasVLX())
47629 return SDValue();
47630
47631 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
47632
47633 X = DAG.getBitcast(BlendVT, X);
47634 Y = DAG.getBitcast(BlendVT, Y);
47635 Mask = DAG.getBitcast(BlendVT, Mask);
47636 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
47637 return DAG.getBitcast(VT, Mask);
47638}
47639
47640// Helper function for combineOrCmpEqZeroToCtlzSrl
47641// Transforms:
47642// seteq(cmp x, 0)
47643// into:
47644// srl(ctlz x), log2(bitsize(x))
47645// Input pattern is checked by caller.
47646static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
47647 SDValue Cmp = Op.getOperand(1);
47648 EVT VT = Cmp.getOperand(0).getValueType();
47649 unsigned Log2b = Log2_32(VT.getSizeInBits());
47650 SDLoc dl(Op);
47651 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
47652 // The result of the shift is true or false, and on X86, the 32-bit
47653 // encoding of shr and lzcnt is more desirable.
47654 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
47655 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
47656 DAG.getConstant(Log2b, dl, MVT::i8));
47657 return Scc;
47658}
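The helper leans on LZCNT semantics: lzcnt of a 32-bit value is 32 only for zero, so shifting the count right by log2(32) = 5 yields exactly the seteq-with-zero result. A hedged scalar sketch (assumes <immintrin.h> and an LZCNT-capable target; not part of this file):

  #include <immintrin.h>
  // _lzcnt_u32(0) == 32, so bit 5 of the count is set only when x == 0.
  unsigned isZero(unsigned x) {
    return _lzcnt_u32(x) >> 5; // 1 if x == 0, otherwise 0
  }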
47659
47660// Try to transform:
47661// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
47662// into:
47663// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
47664// Will also attempt to match more generic cases, eg:
47665// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
47666// Only applies if the target supports the FastLZCNT feature.
47667static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
47668 TargetLowering::DAGCombinerInfo &DCI,
47669 const X86Subtarget &Subtarget) {
47670 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
47671 return SDValue();
47672
47673 auto isORCandidate = [](SDValue N) {
47674 return (N->getOpcode() == ISD::OR && N->hasOneUse());
47675 };
47676
47677 // Check the zero extend is extending to 32-bit or more. The code generated by
47678 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
47679 // instructions to clear the upper bits.
47680 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
47681 !isORCandidate(N->getOperand(0)))
47682 return SDValue();
47683
47684 // Check the node matches: setcc(eq, cmp 0)
47685 auto isSetCCCandidate = [](SDValue N) {
47686 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
47687 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
47688 N->getOperand(1).getOpcode() == X86ISD::CMP &&
47689 isNullConstant(N->getOperand(1).getOperand(1)) &&
47690 N->getOperand(1).getValueType().bitsGE(MVT::i32);
47691 };
47692
47693 SDNode *OR = N->getOperand(0).getNode();
47694 SDValue LHS = OR->getOperand(0);
47695 SDValue RHS = OR->getOperand(1);
47696
47697 // Save nodes matching or(or, setcc(eq, cmp 0)).
47698 SmallVector<SDNode *, 2> ORNodes;
47699 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
47700 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
47701 ORNodes.push_back(OR);
47702 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
47703 LHS = OR->getOperand(0);
47704 RHS = OR->getOperand(1);
47705 }
47706
47707 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
47708 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
47709 !isORCandidate(SDValue(OR, 0)))
47710 return SDValue();
47711
47712 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
47713 // to
47714 // or(srl(ctlz),srl(ctlz)).
47715 // The dag combiner can then fold it into:
47716 // srl(or(ctlz, ctlz)).
47717 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
47718 SDValue Ret, NewRHS;
47719 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
47720 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
47721
47722 if (!Ret)
47723 return SDValue();
47724
47725 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
47726 while (ORNodes.size() > 0) {
47727 OR = ORNodes.pop_back_val();
47728 LHS = OR->getOperand(0);
47729 RHS = OR->getOperand(1);
47730 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
47731 if (RHS->getOpcode() == ISD::OR)
47732 std::swap(LHS, RHS);
47733 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
47734 if (!NewRHS)
47735 return SDValue();
47736 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
47737 }
47738
47739 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
47740}
47741
47742static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
47743 SDValue And1_L, SDValue And1_R, SDLoc DL,
47744 SelectionDAG &DAG) {
47745 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
47746 return SDValue();
47747 SDValue NotOp = And0_L->getOperand(0);
47748 if (NotOp == And1_R)
47749 std::swap(And1_R, And1_L);
47750 if (NotOp != And1_L)
47751 return SDValue();
47752
47753 // (~(NotOp) & And0_R) | (NotOp & And1_R)
47754 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
47755 EVT VT = And1_L->getValueType(0);
47756 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
47757 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
47758 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
47759 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
47760 return Xor1;
47761}
47762
47763/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
47764/// equivalent `((x ^ y) & m) ^ y)` pattern.
47765/// This is typically a better representation for targets without a fused
47766/// "and-not" operation. This function is intended to be called from a
47767/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
47768static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
47769 // Note that masked-merge variants using XOR or ADD expressions are
47770 // normalized to OR by InstCombine so we only check for OR.
47771 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
47772 SDValue N0 = Node->getOperand(0);
47773 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
47774 return SDValue();
47775 SDValue N1 = Node->getOperand(1);
47776 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
47777 return SDValue();
47778
47779 SDLoc DL(Node);
47780 SDValue N00 = N0->getOperand(0);
47781 SDValue N01 = N0->getOperand(1);
47782 SDValue N10 = N1->getOperand(0);
47783 SDValue N11 = N1->getOperand(1);
47784 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
47785 return Result;
47786 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
47787 return Result;
47788 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
47789 return Result;
47790 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
47791 return Result;
47792 return SDValue();
47793}
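The rewrite is the standard masked-merge identity: both expressions pick bits of x where m is set and bits of y elsewhere, but the XOR form needs no 'and-not'. A scalar sketch of the equivalence (illustrative, not part of this file):

  #include <cstdint>
  uint32_t mergeAndNot(uint32_t m, uint32_t x, uint32_t y) {
    return (m & x) | (~m & y); // needs ANDN (BMI) to be cheap
  }
  uint32_t mergeXor(uint32_t m, uint32_t x, uint32_t y) {
    return ((x ^ y) & m) ^ y;  // same bits, plain AND/XOR only
  }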
47794
47795static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
47796 TargetLowering::DAGCombinerInfo &DCI,
47797 const X86Subtarget &Subtarget) {
47798 SDValue N0 = N->getOperand(0);
47799 SDValue N1 = N->getOperand(1);
47800 EVT VT = N->getValueType(0);
47801 SDLoc dl(N);
47802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47803
47804 // If this is SSE1 only convert to FOR to avoid scalarization.
47805 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47806 return DAG.getBitcast(MVT::v4i32,
47807 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
47808 DAG.getBitcast(MVT::v4f32, N0),
47809 DAG.getBitcast(MVT::v4f32, N1)));
47810 }
47811
47812 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
47813 // TODO: Support multiple SrcOps.
47814 if (VT == MVT::i1) {
47815 SmallVector<SDValue, 2> SrcOps;
47816 SmallVector<APInt, 2> SrcPartials;
47817 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
47818 SrcOps.size() == 1) {
47819 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
47820 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47821 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
47822 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
47823 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
47824 if (Mask) {
47825 assert(SrcPartials[0].getBitWidth() == NumElts &&
47826 "Unexpected partial reduction mask");
47827 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
47828 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
47829 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
47830 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
47831 }
47832 }
47833 }
47834
47835 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47836 return R;
47837
47838 if (SDValue R = combineBitOpWithShift(N, DAG))
47839 return R;
47840
47841 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
47842 return FPLogic;
47843
47844 if (DCI.isBeforeLegalizeOps())
47845 return SDValue();
47846
47847 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
47848 return R;
47849
47850 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
47851 return R;
47852
47853 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
47854 return R;
47855
47856 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
47857 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
47858 // iff the upper elements of the non-shifted arg are zero.
47859 // KUNPCK requires 16+ bool vector elements.
47860 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
47861 unsigned NumElts = VT.getVectorNumElements();
47862 unsigned HalfElts = NumElts / 2;
47863 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
47864 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
47865 N1.getConstantOperandAPInt(1) == HalfElts &&
47866 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
47867 return DAG.getNode(
47868 ISD::CONCAT_VECTORS, dl, VT,
47869 extractSubVector(N0, 0, DAG, dl, HalfElts),
47870 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
47871 }
47872 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
47873 N0.getConstantOperandAPInt(1) == HalfElts &&
47874 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
47875 return DAG.getNode(
47876 ISD::CONCAT_VECTORS, dl, VT,
47877 extractSubVector(N1, 0, DAG, dl, HalfElts),
47878 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
47879 }
47880 }
47881
47882 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47883 // Attempt to recursively combine an OR of shuffles.
47884 SDValue Op(N, 0);
47885 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47886 return Res;
47887
47888 // If either operand is a constant mask, then only the elements that aren't
47889 // allones are actually demanded by the other operand.
47890 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
47891 APInt UndefElts;
47892 SmallVector<APInt> EltBits;
47893 int NumElts = VT.getVectorNumElements();
47894 int EltSizeInBits = VT.getScalarSizeInBits();
47895 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
47896 return false;
47897
47898 APInt DemandedElts = APInt::getZero(NumElts);
47899 for (int I = 0; I != NumElts; ++I)
47900 if (!EltBits[I].isAllOnes())
47901 DemandedElts.setBit(I);
47902
47903 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
47904 };
47905 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
47906 if (N->getOpcode() != ISD::DELETED_NODE)
47907 DCI.AddToWorklist(N);
47908 return SDValue(N, 0);
47909 }
47910 }
47911
47912 // We should fold "masked merge" patterns when `andn` is not available.
47913 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
47914 if (SDValue R = foldMaskedMerge(N, DAG))
47915 return R;
47916
47917 return SDValue();
47918}
47919
47920/// Try to turn tests against the signbit in the form of:
47921/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
47922/// into:
47923/// SETGT(X, -1)
47924static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
47925 // This is only worth doing if the output type is i8 or i1.
47926 EVT ResultType = N->getValueType(0);
47927 if (ResultType != MVT::i8 && ResultType != MVT::i1)
47928 return SDValue();
47929
47930 SDValue N0 = N->getOperand(0);
47931 SDValue N1 = N->getOperand(1);
47932
47933 // We should be performing an xor against a truncated shift.
47934 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
47935 return SDValue();
47936
47937 // Make sure we are performing an xor against one.
47938 if (!isOneConstant(N1))
47939 return SDValue();
47940
47941 // SetCC on x86 zero extends so only act on this if it's a logical shift.
47942 SDValue Shift = N0.getOperand(0);
47943 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
47944 return SDValue();
47945
47946 // Make sure we are truncating from one of i16, i32 or i64.
47947 EVT ShiftTy = Shift.getValueType();
47948 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
47949 return SDValue();
47950
47951 // Make sure the shift amount extracts the sign bit.
47952 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
47953 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
47954 return SDValue();
47955
47956 // Create a greater-than comparison against -1.
47957 // N.B. Using SETGE against 0 works but we want a canonical-looking
47958 // comparison; using SETGT matches up with what TranslateX86CC produces.
47959 SDLoc DL(N);
47960 SDValue ShiftOp = Shift.getOperand(0);
47961 EVT ShiftOpTy = ShiftOp.getValueType();
47962 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47963 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
47964 *DAG.getContext(), ResultType);
47965 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
47966 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
47967 if (SetCCResultType != ResultType)
47968 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
47969 return Cond;
47970}
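The scalar identity behind this fold: shifting the sign bit down and flipping it asks "is x non-negative?", which is the same as comparing x > -1. A minimal sketch (illustrative, not part of this file):

  #include <cstdint>
  bool viaShift(int32_t x) {
    return (static_cast<uint32_t>(x) >> 31) ^ 1u; // XOR(TRUNCATE(SRL(x,31)),1)
  }
  bool viaCompare(int32_t x) {
    return x > -1;                                // SETGT(x, -1), what we emit
  }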
47971
47972/// Turn vector tests of the signbit in the form of:
47973/// xor (sra X, elt_size(X)-1), -1
47974/// into:
47975/// pcmpgt X, -1
47976///
47977/// This should be called before type legalization because the pattern may not
47978/// persist after that.
47979static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
47980 const X86Subtarget &Subtarget) {
47981 EVT VT = N->getValueType(0);
47982 if (!VT.isSimple())
47983 return SDValue();
47984
47985 switch (VT.getSimpleVT().SimpleTy) {
47986 default: return SDValue();
47987 case MVT::v16i8:
47988 case MVT::v8i16:
47989 case MVT::v4i32:
47990 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
47991 case MVT::v32i8:
47992 case MVT::v16i16:
47993 case MVT::v8i32:
47994 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
47995 }
47996
47997 // There must be a shift right algebraic before the xor, and the xor must be a
47998 // 'not' operation.
47999 SDValue Shift = N->getOperand(0);
48000 SDValue Ones = N->getOperand(1);
48001 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
48002 !ISD::isBuildVectorAllOnes(Ones.getNode()))
48003 return SDValue();
48004
48005 // The shift should be smearing the sign bit across each vector element.
48006 auto *ShiftAmt =
48007 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
48008 if (!ShiftAmt ||
48009 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
48010 return SDValue();
48011
48012 // Create a greater-than comparison against -1. We don't use the more obvious
48013 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
48014 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
48015}
48016
48017/// Detect patterns of truncation with unsigned saturation:
48018///
48019/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
48020/// Return the source value x to be truncated or SDValue() if the pattern was
48021/// not matched.
48022///
48023/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
48024/// where C1 >= 0 and C2 is unsigned max of destination type.
48025///
48026/// (truncate (smax (smin (x, C2), C1)) to dest_type)
48027/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
48028///
48029/// These two patterns are equivalent to:
48030/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
48031/// So return the smax(x, C1) value to be truncated or SDValue() if the
48032/// pattern was not matched.
48033static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
48034 const SDLoc &DL) {
48035 EVT InVT = In.getValueType();
48036
48037 // Saturation with truncation. We truncate from InVT to VT.
48038 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
48039 "Unexpected types for truncate operation");
48040
48041 // Match min/max and return limit value as a parameter.
48042 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
48043 if (V.getOpcode() == Opcode &&
48044 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
48045 return V.getOperand(0);
48046 return SDValue();
48047 };
48048
48049 APInt C1, C2;
48050 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
48051 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
48052 // the element size of the destination type.
48053 if (C2.isMask(VT.getScalarSizeInBits()))
48054 return UMin;
48055
48056 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
48057 if (MatchMinMax(SMin, ISD::SMAX, C1))
48058 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
48059 return SMin;
48060
48061 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
48062 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
48063 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
48064 C2.uge(C1)) {
48065 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
48066 }
48067
48068 return SDValue();
48069}
48070
48071/// Detect patterns of truncation with signed saturation:
48072/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
48073/// signed_max_of_dest_type)) to dest_type)
48074/// or:
48075/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
48076/// signed_min_of_dest_type)) to dest_type).
48077/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
48078/// Return the source value to be truncated or SDValue() if the pattern was not
48079/// matched.
48080static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
48081 unsigned NumDstBits = VT.getScalarSizeInBits();
48082 unsigned NumSrcBits = In.getScalarValueSizeInBits();
48083 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
48084
48085 auto MatchMinMax = [](SDValue V, unsigned Opcode,
48086 const APInt &Limit) -> SDValue {
48087 APInt C;
48088 if (V.getOpcode() == Opcode &&
48089 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
48090 return V.getOperand(0);
48091 return SDValue();
48092 };
48093
48094 APInt SignedMax, SignedMin;
48095 if (MatchPackUS) {
48096 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
48097 SignedMin = APInt(NumSrcBits, 0);
48098 } else {
48099 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
48100 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
48101 }
48102
48103 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
48104 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
48105 return SMax;
48106
48107 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
48108 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
48109 return SMin;
48110
48111 return SDValue();
48112}
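For reference, the scalar analogues of the two saturation shapes these detectors match (clamp first, then narrow); the vector forms simply do this per lane. A hedged C++ sketch (illustrative, not part of this file):

  #include <algorithm>
  #include <cstdint>
  // Unsigned saturation: umin against the destination's unsigned max (0xFF).
  uint8_t truncUSat(uint32_t x) {
    return static_cast<uint8_t>(std::min<uint32_t>(x, 0xFFu));
  }
  // Signed saturation: smin/smax sandwich against the destination's range.
  int8_t truncSSat(int32_t x) {
    return static_cast<int8_t>(std::min(std::max(x, -128), 127));
  }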
48113
48114static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
48115 SelectionDAG &DAG,
48116 const X86Subtarget &Subtarget) {
48117 if (!Subtarget.hasSSE2() || !VT.isVector())
48118 return SDValue();
48119
48120 EVT SVT = VT.getVectorElementType();
48121 EVT InVT = In.getValueType();
48122 EVT InSVT = InVT.getVectorElementType();
48123
48124 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
48125 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
48126 // and concatenate at the same time. Then we can use a final vpmovuswb to
48127 // clip to 0-255.
48128 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
48129 InVT == MVT::v16i32 && VT == MVT::v16i8) {
48130 if (auto USatVal = detectSSatPattern(In, VT, true)) {
48131 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
48132 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
48133 DL, DAG, Subtarget);
48134 assert(Mid && "Failed to pack!");
48135 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
48136 }
48137 }
48138
48139 // vXi32 truncate instructions are available with AVX512F.
48140 // vXi16 truncate instructions are only available with AVX512BW.
48141 // For 256-bit or smaller vectors, we require VLX.
48142 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
48143 // If the result type is 256 bits or larger and we have disabled 512-bit
48144 // registers, we should go ahead and use the pack instructions if possible.
48145 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
48146 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
48147 (InVT.getSizeInBits() > 128) &&
48148 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
48149 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
48150
48151 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
48152 VT.getSizeInBits() >= 64 &&
48153 (SVT == MVT::i8 || SVT == MVT::i16) &&
48154 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
48155 if (auto USatVal = detectSSatPattern(In, VT, true)) {
48156 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
48157 // Only do this when the result is at least 64 bits or we'll be leaving
48158 // dangling PACKSSDW nodes.
48159 if (SVT == MVT::i8 && InSVT == MVT::i32) {
48160 EVT MidVT = VT.changeVectorElementType(MVT::i16);
48161 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
48162 DAG, Subtarget);
48163 assert(Mid && "Failed to pack!");
48164 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
48165 Subtarget);
48166 assert(V && "Failed to pack!");
48167 return V;
48168 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
48169 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
48170 Subtarget);
48171 }
48172 if (auto SSatVal = detectSSatPattern(In, VT))
48173 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
48174 Subtarget);
48175 }
48176
48177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48178 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
48179 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
48180 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
48181 unsigned TruncOpc = 0;
48182 SDValue SatVal;
48183 if (auto SSatVal = detectSSatPattern(In, VT)) {
48184 SatVal = SSatVal;
48185 TruncOpc = X86ISD::VTRUNCS;
48186 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
48187 SatVal = USatVal;
48188 TruncOpc = X86ISD::VTRUNCUS;
48189 }
48190 if (SatVal) {
48191 unsigned ResElts = VT.getVectorNumElements();
48192 // If the input type is less than 512 bits and we don't have VLX, we need
48193 // to widen to 512 bits.
48194 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
48195 unsigned NumConcats = 512 / InVT.getSizeInBits();
48196 ResElts *= NumConcats;
48197 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
48198 ConcatOps[0] = SatVal;
48199 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
48200 NumConcats * InVT.getVectorNumElements());
48201 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
48202 }
48203 // Widen the result if it's narrower than 128 bits.
48204 if (ResElts * SVT.getSizeInBits() < 128)
48205 ResElts = 128 / SVT.getSizeInBits();
48206 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
48207 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
48208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
48209 DAG.getIntPtrConstant(0, DL));
48210 }
48211 }
48212
48213 return SDValue();
48214}
48215
48216/// This function detects the AVG pattern between vectors of unsigned i8/i16,
48217/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
48218/// ISD::AVGCEILU (AVG) instruction.
48219static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
48220 const X86Subtarget &Subtarget,
48221 const SDLoc &DL) {
48222 if (!VT.isVector())
48223 return SDValue();
48224 EVT InVT = In.getValueType();
48225 unsigned NumElems = VT.getVectorNumElements();
48226
48227 EVT ScalarVT = VT.getVectorElementType();
48228 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
48229 return SDValue();
48230
48231 // InScalarVT is the intermediate type in AVG pattern and it should be greater
48232 // than the original input type (i8/i16).
48233 EVT InScalarVT = InVT.getVectorElementType();
48234 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
48235 return SDValue();
48236
48237 if (!Subtarget.hasSSE2())
48238 return SDValue();
48239
48240 // Detect the following pattern:
48241 //
48242 // %1 = zext <N x i8> %a to <N x i32>
48243 // %2 = zext <N x i8> %b to <N x i32>
48244 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
48245 // %4 = add nuw nsw <N x i32> %3, %2
48246 // %5 = lshr <N x i32> %4, <i32 1 x N>
48247 // %6 = trunc <N x i32> %5 to <N x i8>
48248 //
48249 // In AVX512, the last instruction can also be a trunc store.
48250 if (In.getOpcode() != ISD::SRL)
48251 return SDValue();
48252
48253 // A lambda checking the given SDValue is a constant vector and each element
48254 // is in the range [Min, Max].
48255 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
48256 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
48257 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
48258 });
48259 };
48260
48261 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
48262 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
48263 return MaxActiveBits <= ScalarVT.getSizeInBits();
48264 };
48265
48266 // Check if each element of the vector is right-shifted by one.
48267 SDValue LHS = In.getOperand(0);
48268 SDValue RHS = In.getOperand(1);
48269 if (!IsConstVectorInRange(RHS, 1, 1))
48270 return SDValue();
48271 if (LHS.getOpcode() != ISD::ADD)
48272 return SDValue();
48273
48274 // Detect a pattern of a + b + 1 where the order doesn't matter.
48275 SDValue Operands[3];
48276 Operands[0] = LHS.getOperand(0);
48277 Operands[1] = LHS.getOperand(1);
48278
48279 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48280 ArrayRef<SDValue> Ops) {
48281 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
48282 };
48283
48284 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
48285 for (SDValue &Op : Ops)
48286 if (Op.getValueType() != VT)
48287 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
48288 // Pad to a power-of-2 vector, split+apply and extract the original vector.
48289 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
48290 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
48291 if (NumElemsPow2 != NumElems) {
48292 for (SDValue &Op : Ops) {
48293 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
48294 for (unsigned i = 0; i != NumElems; ++i) {
48295 SDValue Idx = DAG.getIntPtrConstant(i, DL);
48296 EltsOfOp[i] =
48297 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
48298 }
48299 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
48300 }
48301 }
48302 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
48303 if (NumElemsPow2 == NumElems)
48304 return Res;
48305 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
48306 DAG.getIntPtrConstant(0, DL));
48307 };
48308
48309 // Take care of the case when one of the operands is a constant vector whose
48310 // elements are all in the range [1, 256] (or [1, 65536] for i16).
48311 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
48312 IsZExtLike(Operands[0])) {
48313 // The pattern is detected. Subtract one from the constant vector, then
48314 // demote it and emit the ISD::AVGCEILU instruction.
48315 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
48316 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
48317 return AVGSplitter({Operands[0], Operands[1]});
48318 }
48319
48320 // Matches 'add like' patterns: add(Op0,Op1) and zext(or(Op0,Op1)).
48321 // Match the or case only if it's 'add-like', i.e. can be replaced by an add.
48322 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
48323 if (ISD::ADD == V.getOpcode()) {
48324 Op0 = V.getOperand(0);
48325 Op1 = V.getOperand(1);
48326 return true;
48327 }
48328 if (ISD::ZERO_EXTEND != V.getOpcode())
48329 return false;
48330 V = V.getOperand(0);
48331 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
48332 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
48333 return false;
48334 Op0 = V.getOperand(0);
48335 Op1 = V.getOperand(1);
48336 return true;
48337 };
48338
48339 SDValue Op0, Op1;
48340 if (FindAddLike(Operands[0], Op0, Op1))
48341 std::swap(Operands[0], Operands[1]);
48342 else if (!FindAddLike(Operands[1], Op0, Op1))
48343 return SDValue();
48344 Operands[2] = Op0;
48345 Operands[1] = Op1;
48346
48347 // Now we have three operands of two additions. Check that one of them is a
48348 // constant vector with ones, and the other two can be promoted from i8/i16.
48349 for (int i = 0; i < 3; ++i) {
48350 if (!IsConstVectorInRange(Operands[i], 1, 1))
48351 continue;
48352 std::swap(Operands[i], Operands[2]);
48353
48354 // Check if Operands[0] and Operands[1] are results of type promotion.
48355 for (int j = 0; j < 2; ++j)
48356 if (Operands[j].getValueType() != VT)
48357 if (!IsZExtLike(Operands[j]))
48358 return SDValue();
48359
48360 // The pattern is detected, emit ISD::AVGCEILU instruction(s).
48361 return AVGSplitter({Operands[0], Operands[1]});
48362 }
48363
48364 return SDValue();
48365}
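
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] A minimal scalar reference for the semantics detectAVGPattern matches, using a hypothetical helper name avg_ceil_u8: the widened add, add-one, shift and truncate below is what a single ISD::AVGCEILU (e.g. PAVGB) computes per element.

#include <cstdint>

// Hypothetical reference helper: rounding-up average of two unsigned bytes,
// computed in a wider type so the intermediate sum cannot overflow.
static inline uint8_t avg_ceil_u8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((static_cast<uint16_t>(a) +
                               static_cast<uint16_t>(b) + 1u) >> 1);
}
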
48366
48367static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
48368 TargetLowering::DAGCombinerInfo &DCI,
48369 const X86Subtarget &Subtarget) {
48370 LoadSDNode *Ld = cast<LoadSDNode>(N);
48371 EVT RegVT = Ld->getValueType(0);
48372 EVT MemVT = Ld->getMemoryVT();
48373 SDLoc dl(Ld);
48374 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48375
48376 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
48377 // into two 16-byte operations. Also split non-temporal aligned loads on
48378 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
48379 ISD::LoadExtType Ext = Ld->getExtensionType();
48380 bool Fast;
48381 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
48382 Ext == ISD::NON_EXTLOAD &&
48383 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
48384 Ld->getAlignment() >= 16) ||
48385 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
48386 *Ld->getMemOperand(), &Fast) &&
48387 !Fast))) {
48388 unsigned NumElems = RegVT.getVectorNumElements();
48389 if (NumElems < 2)
48390 return SDValue();
48391
48392 unsigned HalfOffset = 16;
48393 SDValue Ptr1 = Ld->getBasePtr();
48394 SDValue Ptr2 =
48395 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
48396 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
48397 NumElems / 2);
48398 SDValue Load1 =
48399 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
48400 Ld->getOriginalAlign(),
48401 Ld->getMemOperand()->getFlags());
48402 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
48403 Ld->getPointerInfo().getWithOffset(HalfOffset),
48404 Ld->getOriginalAlign(),
48405 Ld->getMemOperand()->getFlags());
48406 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
48407 Load1.getValue(1), Load2.getValue(1));
48408
48409 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
48410 return DCI.CombineTo(N, NewVec, TF, true);
48411 }
48412
48413 // Bool vector load - attempt to cast to an integer, as we have good
48414 // (vXiY *ext(vXi1 bitcast(iX))) handling.
48415 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
48416 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
48417 unsigned NumElts = RegVT.getVectorNumElements();
48418 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
48419 if (TLI.isTypeLegal(IntVT)) {
48420 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
48421 Ld->getPointerInfo(),
48422 Ld->getOriginalAlign(),
48423 Ld->getMemOperand()->getFlags());
48424 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
48425 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
48426 }
48427 }
48428
48429 // If we also broadcast this as a subvector to a wider type, then just extract
48430 // the lowest subvector.
48431 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
48432 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
48433 SDValue Ptr = Ld->getBasePtr();
48434 SDValue Chain = Ld->getChain();
48435 for (SDNode *User : Ptr->uses()) {
48436 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
48437 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
48438 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
48439 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
48440 MemVT.getSizeInBits() &&
48441 !User->hasAnyUseOfValue(1) &&
48442 User->getValueSizeInBits(0).getFixedSize() >
48443 RegVT.getFixedSizeInBits()) {
48444 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
48445 RegVT.getSizeInBits());
48446 Extract = DAG.getBitcast(RegVT, Extract);
48447 return DCI.CombineTo(N, Extract, SDValue(User, 1));
48448 }
48449 }
48450 }
48451
48452 // Cast ptr32 and ptr64 pointers to the default address space before a load.
48453 unsigned AddrSpace = Ld->getAddressSpace();
48454 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
48455 AddrSpace == X86AS::PTR32_UPTR) {
48456 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
48457 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
48458 SDValue Cast =
48459 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
48460 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
48461 Ld->getOriginalAlign(),
48462 Ld->getMemOperand()->getFlags());
48463 }
48464 }
48465
48466 return SDValue();
48467}
48468
48469/// If V is a build vector of boolean constants and exactly one of those
48470/// constants is true, return the operand index of that true element.
48471/// Otherwise, return -1.
48472static int getOneTrueElt(SDValue V) {
48473 // This needs to be a build vector of booleans.
48474 // TODO: Checking for the i1 type matches the IR definition for the mask,
48475 // but the mask check could be loosened to i8 or other types. That might
48476 // also require checking more than 'allOnesValue'; e.g., the x86 HW
48477 // instructions only require that the MSB is set for each mask element.
48478 // The ISD::MSTORE comments/definition do not specify how the mask operand
48479 // is formatted.
48480 auto *BV = dyn_cast<BuildVectorSDNode>(V);
48481 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
48482 return -1;
48483
48484 int TrueIndex = -1;
48485 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
48486 for (unsigned i = 0; i < NumElts; ++i) {
48487 const SDValue &Op = BV->getOperand(i);
48488 if (Op.isUndef())
48489 continue;
48490 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
48491 if (!ConstNode)
48492 return -1;
48493 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
48494 // If we already found a one, this is too many.
48495 if (TrueIndex >= 0)
48496 return -1;
48497 TrueIndex = i;
48498 }
48499 }
48500 return TrueIndex;
48501}
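
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] The same "exactly one true element" scan written over a plain bool array, with a hypothetical name getOneTrueIndex; it returns the index of the single true entry, or -1 for zero or multiple true entries.

// Hypothetical sketch mirroring getOneTrueElt's logic on a plain array.
static int getOneTrueIndex(const bool *Mask, unsigned NumElts) {
  int TrueIndex = -1;
  for (unsigned i = 0; i < NumElts; ++i) {
    if (!Mask[i])
      continue;
    if (TrueIndex >= 0)
      return -1;                      // more than one true element
    TrueIndex = static_cast<int>(i);
  }
  return TrueIndex;                   // -1 if no true element was found
}
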
48502
48503/// Given a masked memory load/store operation, return true if it has one mask
48504/// bit set. If it has one mask bit set, then also return the memory address of
48505/// the scalar element to load/store, the vector index to insert/extract that
48506/// scalar element, and the alignment for the scalar memory access.
48507static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
48508 SelectionDAG &DAG, SDValue &Addr,
48509 SDValue &Index, Align &Alignment,
48510 unsigned &Offset) {
48511 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
48512 if (TrueMaskElt < 0)
48513 return false;
48514
48515 // Get the address of the one scalar element that is specified by the mask
48516 // using the appropriate offset from the base pointer.
48517 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
48518 Offset = 0;
48519 Addr = MaskedOp->getBasePtr();
48520 if (TrueMaskElt != 0) {
48521 Offset = TrueMaskElt * EltVT.getStoreSize();
48522 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
48523 SDLoc(MaskedOp));
48524 }
48525
48526 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
48527 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
48528 EltVT.getStoreSize());
48529 return true;
48530}
48531
48532/// If exactly one element of the mask is set for a non-extending masked load,
48533/// reduce it to a scalar load and a vector insert.
48534/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48535/// mask have already been optimized in IR, so we don't bother with those here.
48536static SDValue
48537reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48538 TargetLowering::DAGCombinerInfo &DCI,
48539 const X86Subtarget &Subtarget) {
48540 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48541 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48542 // However, some target hooks may need to be added to know when the transform
48543 // is profitable. Endianness would also have to be considered.
48544
48545 SDValue Addr, VecIndex;
48546 Align Alignment;
48547 unsigned Offset;
48548 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
48549 return SDValue();
48550
48551 // Load the one scalar element that is specified by the mask using the
48552 // appropriate offset from the base pointer.
48553 SDLoc DL(ML);
48554 EVT VT = ML->getValueType(0);
48555 EVT EltVT = VT.getVectorElementType();
48556
48557 EVT CastVT = VT;
48558 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48559 EltVT = MVT::f64;
48560 CastVT = VT.changeVectorElementType(EltVT);
48561 }
48562
48563 SDValue Load =
48564 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
48565 ML->getPointerInfo().getWithOffset(Offset),
48566 Alignment, ML->getMemOperand()->getFlags());
48567
48568 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
48569
48570 // Insert the loaded element into the appropriate place in the vector.
48571 SDValue Insert =
48572 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
48573 Insert = DAG.getBitcast(VT, Insert);
48574 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
48575}
48576
48577static SDValue
48578combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48579 TargetLowering::DAGCombinerInfo &DCI) {
48580 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48581 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
48582 return SDValue();
48583
48584 SDLoc DL(ML);
48585 EVT VT = ML->getValueType(0);
48586
48587 // If we are loading the first and last elements of a vector, it is safe and
48588 // always faster to load the whole vector. Replace the masked load with a
48589 // vector load and select.
48590 unsigned NumElts = VT.getVectorNumElements();
48591 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
48592 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
48593 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
48594 if (LoadFirstElt && LoadLastElt) {
48595 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
48596 ML->getMemOperand());
48597 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
48598 ML->getPassThru());
48599 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
48600 }
48601
48602 // Convert a masked load with a constant mask into a masked load and a select.
48603 // This allows the select operation to use a faster kind of select instruction
48604 // (for example, vblendvps -> vblendps).
48605
48606 // Don't try this if the pass-through operand is already undefined. That would
48607 // cause an infinite loop because that's what we're about to create.
48608 if (ML->getPassThru().isUndef())
48609 return SDValue();
48610
48611 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
48612 return SDValue();
48613
48614 // The new masked load has an undef pass-through operand. The select uses the
48615 // original pass-through operand.
48616 SDValue NewML = DAG.getMaskedLoad(
48617 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
48618 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
48619 ML->getAddressingMode(), ML->getExtensionType());
48620 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
48621 ML->getPassThru());
48622
48623 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
48624}
48625
48626static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
48627 TargetLowering::DAGCombinerInfo &DCI,
48628 const X86Subtarget &Subtarget) {
48629 auto *Mld = cast<MaskedLoadSDNode>(N);
48630
48631 // TODO: Expanding load with constant mask may be optimized as well.
48632 if (Mld->isExpandingLoad())
48633 return SDValue();
48634
48635 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
48636 if (SDValue ScalarLoad =
48637 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
48638 return ScalarLoad;
48639
48640 // TODO: Do some AVX512 subsets benefit from this transform?
48641 if (!Subtarget.hasAVX512())
48642 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
48643 return Blend;
48644 }
48645
48646 // If the mask value has been legalized to a non-boolean vector, try to
48647 // simplify ops leading up to it. We only demand the MSB of each lane.
48648 SDValue Mask = Mld->getMask();
48649 if (Mask.getScalarValueSizeInBits() != 1) {
48650 EVT VT = Mld->getValueType(0);
48651 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48652 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48653 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48654 if (N->getOpcode() != ISD::DELETED_NODE)
48655 DCI.AddToWorklist(N);
48656 return SDValue(N, 0);
48657 }
48658 if (SDValue NewMask =
48659 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48660 return DAG.getMaskedLoad(
48661 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
48662 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
48663 Mld->getAddressingMode(), Mld->getExtensionType());
48664 }
48665
48666 return SDValue();
48667}
48668
48669/// If exactly one element of the mask is set for a non-truncating masked store,
48670/// reduce it to a vector extract and a scalar store.
48671/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48672/// mask have already been optimized in IR, so we don't bother with those here.
48673static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
48674 SelectionDAG &DAG,
48675 const X86Subtarget &Subtarget) {
48676 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48677 // However, some target hooks may need to be added to know when the transform
48678 // is profitable. Endianness would also have to be considered.
48679
48680 SDValue Addr, VecIndex;
48681 Align Alignment;
48682 unsigned Offset;
48683 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
48684 return SDValue();
48685
48686 // Extract the one scalar element that is actually being stored.
48687 SDLoc DL(MS);
48688 SDValue Value = MS->getValue();
48689 EVT VT = Value.getValueType();
48690 EVT EltVT = VT.getVectorElementType();
48691 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48692 EltVT = MVT::f64;
48693 EVT CastVT = VT.changeVectorElementType(EltVT);
48694 Value = DAG.getBitcast(CastVT, Value);
48695 }
48696 SDValue Extract =
48697 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
48698
48699 // Store that element at the appropriate offset from the base pointer.
48700 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
48701 MS->getPointerInfo().getWithOffset(Offset),
48702 Alignment, MS->getMemOperand()->getFlags());
48703}
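
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] For a one-hot mask such as <0,1,0,0> over <4 x i32>, the combine above is equivalent to the scalar store sketched below (hypothetical helper name), writing only element 1 at offset 1 * sizeof(int32_t) from the base pointer.

#include <cstdint>

// Hypothetical scalar equivalent of a masked store whose mask is <0,1,0,0>.
static void maskedStoreOneElt_ref(int32_t *Base, const int32_t Val[4]) {
  Base[1] = Val[1];  // extract element 1 and store it at Base + 4 bytes
}
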
48704
48705static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
48706 TargetLowering::DAGCombinerInfo &DCI,
48707 const X86Subtarget &Subtarget) {
48708 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
48709 if (Mst->isCompressingStore())
48710 return SDValue();
48711
48712 EVT VT = Mst->getValue().getValueType();
48713 SDLoc dl(Mst);
48714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48715
48716 if (Mst->isTruncatingStore())
48717 return SDValue();
48718
48719 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
48720 return ScalarStore;
48721
48722 // If the mask value has been legalized to a non-boolean vector, try to
48723 // simplify ops leading up to it. We only demand the MSB of each lane.
48724 SDValue Mask = Mst->getMask();
48725 if (Mask.getScalarValueSizeInBits() != 1) {
48726 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48727 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48728 if (N->getOpcode() != ISD::DELETED_NODE)
48729 DCI.AddToWorklist(N);
48730 return SDValue(N, 0);
48731 }
48732 if (SDValue NewMask =
48733 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48734 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
48735 Mst->getBasePtr(), Mst->getOffset(), NewMask,
48736 Mst->getMemoryVT(), Mst->getMemOperand(),
48737 Mst->getAddressingMode());
48738 }
48739
48740 SDValue Value = Mst->getValue();
48741 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
48742 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
48743 Mst->getMemoryVT())) {
48744 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
48745 Mst->getBasePtr(), Mst->getOffset(), Mask,
48746 Mst->getMemoryVT(), Mst->getMemOperand(),
48747 Mst->getAddressingMode(), true);
48748 }
48749
48750 return SDValue();
48751}
48752
48753static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
48754 TargetLowering::DAGCombinerInfo &DCI,
48755 const X86Subtarget &Subtarget) {
48756 StoreSDNode *St = cast<StoreSDNode>(N);
48757 EVT StVT = St->getMemoryVT();
48758 SDLoc dl(St);
48759 SDValue StoredVal = St->getValue();
48760 EVT VT = StoredVal.getValueType();
48761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48762
48763 // Convert a store of vXi1 into a store of iX and a bitcast.
48764 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
48765 VT.getVectorElementType() == MVT::i1) {
48766
48767 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48768 StoredVal = DAG.getBitcast(NewVT, StoredVal);
48769
48770 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48771 St->getPointerInfo(), St->getOriginalAlign(),
48772 St->getMemOperand()->getFlags());
48773 }
48774
48775 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
48776 // This will avoid a copy to k-register.
48777 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
48778 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
48779 StoredVal.getOperand(0).getValueType() == MVT::i8) {
48780 SDValue Val = StoredVal.getOperand(0);
48781 // We must store zeros to the unused bits.
48782 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
48783 return DAG.getStore(St->getChain(), dl, Val,
48784 St->getBasePtr(), St->getPointerInfo(),
48785 St->getOriginalAlign(),
48786 St->getMemOperand()->getFlags());
48787 }
48788
48789 // Widen v2i1/v4i1 stores to v8i1.
48790 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
48791 Subtarget.hasAVX512()) {
48792 unsigned NumConcats = 8 / VT.getVectorNumElements();
48793 // We must store zeros to the unused bits.
48794 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
48795 Ops[0] = StoredVal;
48796 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
48797 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48798 St->getPointerInfo(), St->getOriginalAlign(),
48799 St->getMemOperand()->getFlags());
48800 }
48801
48802 // Turn vXi1 stores of constants into a scalar store.
48803 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
48804 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
48805 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
48806 // If it's a v64i1 store without 64-bit support, we need two stores.
48807 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
48808 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
48809 StoredVal->ops().slice(0, 32));
48810 Lo = combinevXi1ConstantToInteger(Lo, DAG);
48811 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
48812 StoredVal->ops().slice(32, 32));
48813 Hi = combinevXi1ConstantToInteger(Hi, DAG);
48814
48815 SDValue Ptr0 = St->getBasePtr();
48816 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
48817
48818 SDValue Ch0 =
48819 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
48820 St->getOriginalAlign(),
48821 St->getMemOperand()->getFlags());
48822 SDValue Ch1 =
48823 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
48824 St->getPointerInfo().getWithOffset(4),
48825 St->getOriginalAlign(),
48826 St->getMemOperand()->getFlags());
48827 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
48828 }
48829
48830 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
48831 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48832 St->getPointerInfo(), St->getOriginalAlign(),
48833 St->getMemOperand()->getFlags());
48834 }
48835
48836 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
48837 // Sandy Bridge, perform two 16-byte stores.
48838 bool Fast;
48839 if (VT.is256BitVector() && StVT == VT &&
48840 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
48841 *St->getMemOperand(), &Fast) &&
48842 !Fast) {
48843 unsigned NumElems = VT.getVectorNumElements();
48844 if (NumElems < 2)
48845 return SDValue();
48846
48847 return splitVectorStore(St, DAG);
48848 }
48849
48850 // Split under-aligned vector non-temporal stores.
48851 if (St->isNonTemporal() && StVT == VT &&
48852 St->getAlignment() < VT.getStoreSize()) {
48853 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
48854 // vectors or the legalizer can scalarize it to use MOVNTI.
48855 if (VT.is256BitVector() || VT.is512BitVector()) {
48856 unsigned NumElems = VT.getVectorNumElements();
48857 if (NumElems < 2)
48858 return SDValue();
48859 return splitVectorStore(St, DAG);
48860 }
48861
48862 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
48863 // to use MOVNTI.
48864 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
48865 MVT NTVT = Subtarget.hasSSE4A()
48866 ? MVT::v2f64
48867 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
48868 return scalarizeVectorStore(St, NTVT, DAG);
48869 }
48870 }
48871
48872 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
48873 // supported but AVX512F is, by extending to v16i32 and truncating.
48874 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
48875 St->getValue().getOpcode() == ISD::TRUNCATE &&
48876 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
48877 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
48878 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
48879 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
48880 St->getValue().getOperand(0));
48881 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
48882 MVT::v16i8, St->getMemOperand());
48883 }
48884
48885 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
48886 if (!St->isTruncatingStore() &&
48887 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
48888 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
48889 StoredVal.hasOneUse() &&
48890 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
48891 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
48892 return EmitTruncSStore(IsSigned, St->getChain(),
48893 dl, StoredVal.getOperand(0), St->getBasePtr(),
48894 VT, St->getMemOperand(), DAG);
48895 }
48896
48897 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
48898 if (!St->isTruncatingStore()) {
48899 auto IsExtractedElement = [](SDValue V) {
48900 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
48901 V = V.getOperand(0);
48902 unsigned Opc = V.getOpcode();
48903 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
48904 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
48905 V.getOperand(0).hasOneUse())
48906 return V.getOperand(0);
48907 return SDValue();
48908 };
48909 if (SDValue Extract = IsExtractedElement(StoredVal)) {
48910 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
48911 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
48912 SDValue Src = Trunc.getOperand(0);
48913 MVT DstVT = Trunc.getSimpleValueType();
48914 MVT SrcVT = Src.getSimpleValueType();
48915 unsigned NumSrcElts = SrcVT.getVectorNumElements();
48916 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
48917 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
48918 if (NumTruncBits == VT.getSizeInBits() &&
48919 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
48920 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
48921 TruncVT, St->getMemOperand());
48922 }
48923 }
48924 }
48925 }
48926
48927 // Optimize trunc store (of multiple scalars) to shuffle and store.
48928 // First, pack all of the elements in one place. Next, store to memory
48929 // in fewer chunks.
48930 if (St->isTruncatingStore() && VT.isVector()) {
48931 // Check if we can detect an AVG pattern from the truncation. If yes,
48932 // replace the trunc store by a normal store with the result of the
48933 // ISD::AVGCEILU instruction.
48934 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
48935 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
48936 Subtarget, dl))
48937 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
48938 St->getPointerInfo(), St->getOriginalAlign(),
48939 St->getMemOperand()->getFlags());
48940
48941 if (TLI.isTruncStoreLegal(VT, StVT)) {
48942 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
48943 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
48944 dl, Val, St->getBasePtr(),
48945 St->getMemoryVT(), St->getMemOperand(), DAG);
48946 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
48947 DAG, dl))
48948 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
48949 dl, Val, St->getBasePtr(),
48950 St->getMemoryVT(), St->getMemOperand(), DAG);
48951 }
48952
48953 return SDValue();
48954 }
48955
48956 // Cast ptr32 and ptr64 pointers to the default address space before a store.
48957 unsigned AddrSpace = St->getAddressSpace();
48958 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
48959 AddrSpace == X86AS::PTR32_UPTR) {
48960 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
48961 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
48962 SDValue Cast =
48963 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
48964 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
48965 St->getPointerInfo(), St->getOriginalAlign(),
48966 St->getMemOperand()->getFlags(), St->getAAInfo());
48967 }
48968 }
48969
48970 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
48971 // the FP state in cases where an emms may be missing.
48972 // A preferable solution to the general problem is to figure out the right
48973 // places to insert EMMS. This qualifies as a quick hack.
48974
48975 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
48976 if (VT.getSizeInBits() != 64)
48977 return SDValue();
48978
48979 const Function &F = DAG.getMachineFunction().getFunction();
48980 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
48981 bool F64IsLegal =
48982 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
48983 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
48984 isa<LoadSDNode>(St->getValue()) &&
48985 cast<LoadSDNode>(St->getValue())->isSimple() &&
48986 St->getChain().hasOneUse() && St->isSimple()) {
48987 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
48988
48989 if (!ISD::isNormalLoad(Ld))
48990 return SDValue();
48991
48992 // Avoid the transformation if there are multiple uses of the loaded value.
48993 if (!Ld->hasNUsesOfValue(1, 0))
48994 return SDValue();
48995
48996 SDLoc LdDL(Ld);
48997 SDLoc StDL(N);
48998 // Lower to a single movq load/store pair.
48999 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
49000 Ld->getBasePtr(), Ld->getMemOperand());
49001
49002 // Make sure new load is placed in same chain order.
49003 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
49004 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
49005 St->getMemOperand());
49006 }
49007
49008 // This is similar to the above case, but here we handle a scalar 64-bit
49009 // integer store that is extracted from a vector on a 32-bit target.
49010 // If we have SSE2, then we can treat it like a floating-point double
49011 // to get past legalization. The execution dependencies fixup pass will
49012 // choose the optimal machine instruction for the store if this really is
49013 // an integer or v2f32 rather than an f64.
49014 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
49015 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
49016 SDValue OldExtract = St->getOperand(1);
49017 SDValue ExtOp0 = OldExtract.getOperand(0);
49018 unsigned VecSize = ExtOp0.getValueSizeInBits();
49019 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
49020 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
49021 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
49022 BitCast, OldExtract.getOperand(1));
49023 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
49024 St->getPointerInfo(), St->getOriginalAlign(),
49025 St->getMemOperand()->getFlags());
49026 }
49027
49028 return SDValue();
49029}
49030
49031static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
49032 TargetLowering::DAGCombinerInfo &DCI,
49033 const X86Subtarget &Subtarget) {
49034 auto *St = cast<MemIntrinsicSDNode>(N);
49035
49036 SDValue StoredVal = N->getOperand(1);
49037 MVT VT = StoredVal.getSimpleValueType();
49038 EVT MemVT = St->getMemoryVT();
49039
49040 // Figure out which elements we demand.
49041 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
49042 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
49043
49044 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49045 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
49046 if (N->getOpcode() != ISD::DELETED_NODE)
49047 DCI.AddToWorklist(N);
49048 return SDValue(N, 0);
49049 }
49050
49051 return SDValue();
49052}
49053
49054/// Return 'true' if this vector operation is "horizontal"
49055/// and return the operands for the horizontal operation in LHS and RHS. A
49056/// horizontal operation performs the binary operation on successive elements
49057/// of its first operand, then on successive elements of its second operand,
49058/// returning the resulting values in a vector. For example, if
49059/// A = < float a0, float a1, float a2, float a3 >
49060/// and
49061/// B = < float b0, float b1, float b2, float b3 >
49062/// then the result of doing a horizontal operation on A and B is
49063/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
49064/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
49065/// A horizontal-op B, for some already available A and B, and if so then LHS is
49066/// set to A, RHS to B, and the routine returns 'true'.
49067static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
49068 SelectionDAG &DAG, const X86Subtarget &Subtarget,
49069 bool IsCommutative,
49070 SmallVectorImpl<int> &PostShuffleMask) {
49071 // If either operand is undef, bail out. The binop should be simplified.
49072 if (LHS.isUndef() || RHS.isUndef())
49073 return false;
49074
49075 // Look for the following pattern:
49076 // A = < float a0, float a1, float a2, float a3 >
49077 // B = < float b0, float b1, float b2, float b3 >
49078 // and
49079 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
49080 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
49081 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
49082 // which is A horizontal-op B.
49083
49084 MVT VT = LHS.getSimpleValueType();
49085 assert((VT.is128BitVector() || VT.is256BitVector()) &&
49086 "Unsupported vector type for horizontal add/sub");
49087 unsigned NumElts = VT.getVectorNumElements();
49088
49089 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
49090 SmallVectorImpl<int> &ShuffleMask) {
49091 bool UseSubVector = false;
49092 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49093 Op.getOperand(0).getValueType().is256BitVector() &&
49094 llvm::isNullConstant(Op.getOperand(1))) {
49095 Op = Op.getOperand(0);
49096 UseSubVector = true;
49097 }
49098 SmallVector<SDValue, 2> SrcOps;
49099 SmallVector<int, 16> SrcMask, ScaledMask;
49100 SDValue BC = peekThroughBitcasts(Op);
49101 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
49102 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
49103 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
49104 })) {
49105 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
49106 if (!UseSubVector && SrcOps.size() <= 2 &&
49107 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
49108 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
49109 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
49110 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
49111 }
49112 if (UseSubVector && SrcOps.size() == 1 &&
49113 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
49114 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
49115 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
49116 ShuffleMask.assign(Mask.begin(), Mask.end());
49117 }
49118 }
49119 };
49120
49121 // View LHS in the form
49122 // LHS = VECTOR_SHUFFLE A, B, LMask
49123 // If LHS is not a shuffle, then pretend it is the identity shuffle:
49124 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
49125 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
49126 SDValue A, B;
49127 SmallVector<int, 16> LMask;
49128 GetShuffle(LHS, A, B, LMask);
49129
49130 // Likewise, view RHS in the form
49131 // RHS = VECTOR_SHUFFLE C, D, RMask
49132 SDValue C, D;
49133 SmallVector<int, 16> RMask;
49134 GetShuffle(RHS, C, D, RMask);
49135
49136 // At least one of the operands should be a vector shuffle.
49137 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
49138 if (NumShuffles == 0)
49139 return false;
49140
49141 if (LMask.empty()) {
49142 A = LHS;
49143 for (unsigned i = 0; i != NumElts; ++i)
49144 LMask.push_back(i);
49145 }
49146
49147 if (RMask.empty()) {
49148 C = RHS;
49149 for (unsigned i = 0; i != NumElts; ++i)
49150 RMask.push_back(i);
49151 }
49152
49153 // If we have a unary mask, ensure the other op is set to null.
49154 if (isUndefOrInRange(LMask, 0, NumElts))
49155 B = SDValue();
49156 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
49157 A = SDValue();
49158
49159 if (isUndefOrInRange(RMask, 0, NumElts))
49160 D = SDValue();
49161 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
49162 C = SDValue();
49163
49164 // If A and B occur in reverse order in RHS, then canonicalize by commuting
49165 // RHS operands and shuffle mask.
49166 if (A != C) {
49167 std::swap(C, D);
49168 ShuffleVectorSDNode::commuteMask(RMask);
49169 }
49170 // Check that the shuffles are both shuffling the same vectors.
49171 if (!(A == C && B == D))
49172 return false;
49173
49174 PostShuffleMask.clear();
49175 PostShuffleMask.append(NumElts, SM_SentinelUndef);
49176
49177 // LHS and RHS are now:
49178 // LHS = shuffle A, B, LMask
49179 // RHS = shuffle A, B, RMask
49180 // Check that the masks correspond to performing a horizontal operation.
49181 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
49182 // so we just repeat the inner loop if this is a 256-bit op.
49183 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
49184 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
49185 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
49186 assert((NumEltsPer128BitChunk % 2 == 0) &&
49187 "Vector type should have an even number of elements in each lane");
49188 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
49189 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
49190 // Ignore undefined components.
49191 int LIdx = LMask[i + j], RIdx = RMask[i + j];
49192 if (LIdx < 0 || RIdx < 0 ||
49193 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
49194 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
49195 continue;
49196
49197 // Check that successive odd/even elements are being operated on. If not,
49198 // this is not a horizontal operation.
49199 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
49200 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
49201 return false;
49202
49203 // Compute the post-shuffle mask index based on where the element
49204 // is stored in the HOP result, and where it needs to be moved to.
49205 int Base = LIdx & ~1u;
49206 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
49207 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
49208
49209 // The low half of the 128-bit result must choose from A.
49210 // The high half of the 128-bit result must choose from B,
49211 // unless B is undef. In that case, we are always choosing from A.
49212 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
49213 Index += NumEltsPer64BitChunk;
49214 PostShuffleMask[i + j] = Index;
49215 }
49216 }
49217
49218 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
49219 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
49220
49221 bool IsIdentityPostShuffle =
49222 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
49223 if (IsIdentityPostShuffle)
49224 PostShuffleMask.clear();
49225
49226 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
49227 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
49228 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
49229 return false;
49230
49231 // If the source nodes are already used in HorizOps then always accept this.
49232 // Shuffle folding should merge these back together.
49233 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
49234 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
49235 });
49236 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
49237 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
49238 });
49239 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
49240
49241 // Assume a SingleSource HOP if we only shuffle one input and don't need to
49242 // shuffle the result.
49243 if (!ForceHorizOp &&
49244 !shouldUseHorizontalOp(NewLHS == NewRHS &&
49245 (NumShuffles < 2 || !IsIdentityPostShuffle),
49246 DAG, Subtarget))
49247 return false;
49248
49249 LHS = DAG.getBitcast(VT, NewLHS);
49250 RHS = DAG.getBitcast(VT, NewRHS);
49251 return true;
49252}
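
[Editor's note - illustrative sketch, not part of X86ISelLowering.cpp.] Scalar reference for the horizontal add the routine above is trying to form, matching the <a0 op a1, a2 op a3, b0 op b1, b2 op b3> layout described in its doc comment (hypothetical helper name).

// Hypothetical scalar model of a 128-bit HADDPS-style horizontal add.
static void hadd4_ref(const float A[4], const float B[4], float R[4]) {
  R[0] = A[0] + A[1];
  R[1] = A[2] + A[3];
  R[2] = B[0] + B[1];
  R[3] = B[2] + B[3];
}
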
49253
49254// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
49255static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
49256 const X86Subtarget &Subtarget) {
49257 EVT VT = N->getValueType(0);
49258 unsigned Opcode = N->getOpcode();
49259 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
49260 SmallVector<int, 8> PostShuffleMask;
49261
49262 switch (Opcode) {
49263 case ISD::FADD:
49264 case ISD::FSUB:
49265 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
49266 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
49267 SDValue LHS = N->getOperand(0);
49268 SDValue RHS = N->getOperand(1);
49269 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
49270 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
49271 PostShuffleMask)) {
49272 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
49273 if (!PostShuffleMask.empty())
49274 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
49275 DAG.getUNDEF(VT), PostShuffleMask);
49276 return HorizBinOp;
49277 }
49278 }
49279 break;
49280 case ISD::ADD:
49281 case ISD::SUB:
49282 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
49283 VT == MVT::v16i16 || VT == MVT::v8i32)) {
49284 SDValue LHS = N->getOperand(0);
49285 SDValue RHS = N->getOperand(1);
49286 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
49287 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
49288 PostShuffleMask)) {
49289 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
49290 ArrayRef<SDValue> Ops) {
49291 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
49292 };
49293 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
49294 {LHS, RHS}, HOpBuilder);
49295 if (!PostShuffleMask.empty())
49296 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
49297 DAG.getUNDEF(VT), PostShuffleMask);
49298 return HorizBinOp;
49299 }
49300 }
49301 break;
49302 }
49303
49304 return SDValue();
49305}
49306
49307// Try to combine the following nodes
49308// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
49309// <i32 -2147483648[float -0.000000e+00]> 0
49310// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
49311// <(load 4 from constant-pool)> t0, t29
49312// [t30: v16i32 = bitcast t27]
49313// t6: v16i32 = xor t7, t27[t30]
49314// t11: v16f32 = bitcast t6
49315// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
49316// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
49317// t22: v16f32 = bitcast t7
49318// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
49319// t24: v32f16 = bitcast t23
49320static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
49321 const X86Subtarget &Subtarget) {
49322 EVT VT = N->getValueType(0);
49323 SDValue LHS = N->getOperand(0);
49324 SDValue RHS = N->getOperand(1);
49325 int CombineOpcode =
49326 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
49327 auto isConjugationConstant = [](const Constant *c) {
49328 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
49329 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
49330 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
49331 switch (CI->getBitWidth()) {
49332 case 16:
49333 return false;
49334 case 32:
49335 return CI->getValue() == ConjugationInt32;
49336 case 64:
49337 return CI->getValue() == ConjugationInt64;
49338 default:
49339 llvm_unreachable("Unexpected bit width");
49340 }
49341 }
49342 if (const auto *CF = dyn_cast<ConstantFP>(c))
49343 return CF->isNegativeZeroValue();
49344 return false;
49345 };
49346 auto combineConjugation = [&](SDValue &r) {
49347 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
49348 SDValue XOR = LHS.getOperand(0);
49349 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
49350 SDValue XORRHS = XOR.getOperand(1);
49351 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
49352 XORRHS = XORRHS.getOperand(0);
49353 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
49354 XORRHS.getOperand(1).getNumOperands()) {
49355 ConstantPoolSDNode *CP =
49356 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
49357 if (CP && isConjugationConstant(CP->getConstVal())) {
49358 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
49359 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
49360 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
49361 r = DAG.getBitcast(VT, FCMulC);
49362 return true;
49363 }
49364 }
49365 }
49366 }
49367 return false;
49368 };
49369 SDValue Res;
49370 if (combineConjugation(Res))
49371 return Res;
49372 std::swap(LHS, RHS);
49373 if (combineConjugation(Res))
49374 return Res;
49375 return Res;
49376}
49377
49378// Try to combine the following nodes:
49379// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
49380static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
49381 const X86Subtarget &Subtarget) {
49382 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
49383 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
49384 Flags.hasAllowContract();
49385 };
49386
49387 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
49388 return DAG.getTarget().Options.NoSignedZerosFPMath ||
49389 Flags.hasNoSignedZeros();
49390 };
49391 auto IsVectorAllNegativeZero = [](const SDNode *N) {
49392 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
49393 return false;
49394 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
49395 "Unexpected vector type!");
49396 if (ConstantPoolSDNode *CP =
49397 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
49398 APInt AI = APInt(32, 0x80008000, true);
49399 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
49400 return CI->getValue() == AI;
49401 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
49402 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
49403 }
49404 return false;
49405 };
49406
49407 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
49408 !AllowContract(N->getFlags()))
49409 return SDValue();
49410
49411 EVT VT = N->getValueType(0);
49412 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
49413 return SDValue();
49414
49415 SDValue LHS = N->getOperand(0);
49416 SDValue RHS = N->getOperand(1);
49417 bool IsConj;
49418 SDValue FAddOp1, MulOp0, MulOp1;
49419 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
49420 &IsVectorAllNegativeZero,
49421 &HasNoSignedZero](SDValue N) -> bool {
49422 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
49423 return false;
49424 SDValue Op0 = N.getOperand(0);
49425 unsigned Opcode = Op0.getOpcode();
49426 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
49427 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
49428 MulOp0 = Op0.getOperand(0);
49429 MulOp1 = Op0.getOperand(1);
49430 IsConj = Opcode == X86ISD::VFCMULC;
49431 return true;
49432 }
49433 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
49434 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
49435 HasNoSignedZero(Op0->getFlags())) ||
49436 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
49437 MulOp0 = Op0.getOperand(0);
49438 MulOp1 = Op0.getOperand(1);
49439 IsConj = Opcode == X86ISD::VFCMADDC;
49440 return true;
49441 }
49442 }
49443 return false;
49444 };
49445
49446 if (GetCFmulFrom(LHS))
49447 FAddOp1 = RHS;
49448 else if (GetCFmulFrom(RHS))
49449 FAddOp1 = LHS;
49450 else
49451 return SDValue();
49452
49453 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
49454 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
49455 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
49456 // FIXME: How do we handle when fast math flags of FADD are different from
49457 // CFMUL's?
49458 SDValue CFmul =
49459 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
49460 return DAG.getBitcast(VT, CFmul);
49461}
49462
49463/// Do target-specific dag combines on floating-point adds/subs.
49464static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
49465 const X86Subtarget &Subtarget) {
49466 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
49467 return HOp;
49468
49469 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
49470 return COp;
49471
49472 return SDValue();
49473}
49474
49475/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
49476/// the codegen.
49477/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
49478/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
49479/// anything that is guaranteed to be transformed by DAGCombiner.
49480static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
49481 const X86Subtarget &Subtarget,
49482 const SDLoc &DL) {
49483 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
49484 SDValue Src = N->getOperand(0);
49485 unsigned SrcOpcode = Src.getOpcode();
49486 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49487
49488 EVT VT = N->getValueType(0);
49489 EVT SrcVT = Src.getValueType();
49490
49491 auto IsFreeTruncation = [VT](SDValue Op) {
49492 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
49493
49494 // See if this has been extended from a smaller/equal size to
49495 // the truncation size, allowing a truncation to combine with the extend.
49496 unsigned Opcode = Op.getOpcode();
49497 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
49498 Opcode == ISD::ZERO_EXTEND) &&
49499 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
49500 return true;
49501
49502 // See if this is a single use constant which can be constant folded.
49503 // NOTE: We don't peek through bitcasts here because there is currently
49504 // no support for constant folding truncate+bitcast+vector_of_constants. So
49505 // we'd just end up with a truncate on both operands, which would
49506 // get turned back into (truncate (binop)), causing an infinite loop.
49507 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
49508 };
49509
49510 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
49511 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
49512 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
49513 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
49514 };
49515
49516 // Don't combine if the operation has other uses.
49517 if (!Src.hasOneUse())
49518 return SDValue();
49519
49520 // Only support vector truncation for now.
49521 // TODO: i64 scalar math would benefit as well.
49522 if (!VT.isVector())
49523 return SDValue();
49524
49525 // In most cases it's only worth pre-truncating if we're only facing the cost
49526 // of one truncation.
49527 // i.e. if one of the inputs will constant fold or the input is repeated.
49528 switch (SrcOpcode) {
49529 case ISD::MUL:
49530 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
49531 // better to truncate if we have the chance.
49532 if (SrcVT.getScalarType() == MVT::i64 &&
49533 TLI.isOperationLegal(SrcOpcode, VT) &&
49534 !TLI.isOperationLegal(SrcOpcode, SrcVT))
49535 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
49536 LLVM_FALLTHROUGH;
49537 case ISD::AND:
49538 case ISD::XOR:
49539 case ISD::OR:
49540 case ISD::ADD:
49541 case ISD::SUB: {
49542 SDValue Op0 = Src.getOperand(0);
49543 SDValue Op1 = Src.getOperand(1);
49544 if (TLI.isOperationLegal(SrcOpcode, VT) &&
49545 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
49546 return TruncateArithmetic(Op0, Op1);
49547 break;
49548 }
49549 }
49550
49551 return SDValue();
49552}
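// The fold above relies on truncation distributing over wrapping integer
// binops: trunc(binop(x, y)) == binop(trunc(x), trunc(y)) for add/sub/mul and
// the bitwise ops. A minimal standalone sketch of that identity in plain C++
// (the helper below is illustrative only, not part of this file):
#include <cassert>
#include <cstdint>

static void checkTruncDistributesOverMul(uint64_t X, uint64_t Y) {
  // Multiply wide, then truncate the result...
  uint32_t WideThenTrunc = static_cast<uint32_t>(X * Y);
  // ...or truncate the inputs first; modular arithmetic yields the same bits.
  uint32_t TruncThenNarrow = static_cast<uint32_t>(X) * static_cast<uint32_t>(Y);
  assert(WideThenTrunc == TruncThenNarrow);
  (void)WideThenTrunc;
  (void)TruncThenNarrow;
}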
49553
49554/// Truncate using ISD::AND mask and X86ISD::PACKUS.
49555/// e.g. trunc <8 x i32> X to <8 x i16> -->
49556/// MaskX = X & 0xffff (clear high bits to prevent saturation)
49557/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
49558static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
49559 const X86Subtarget &Subtarget,
49560 SelectionDAG &DAG) {
49561 SDValue In = N->getOperand(0);
49562 EVT InVT = In.getValueType();
49563 EVT OutVT = N->getValueType(0);
49564
49565 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
49566 OutVT.getScalarSizeInBits());
49567 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
49568 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
49569}
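// PACKUS saturates each lane to the unsigned range of the narrower type, so
// clearing the bits above that range first guarantees the clamp never fires
// and the pack acts as a plain truncate. A one-lane scalar sketch of that
// reasoning (illustrative helpers, not part of this file):
#include <cstdint>

static uint16_t packusLane(int32_t V) { // unsigned-saturating narrow, one lane
  if (V < 0)
    return 0;
  if (V > 65535)
    return 65535;
  return static_cast<uint16_t>(V);
}

static uint16_t truncViaPackus(uint32_t V) {
  uint32_t Masked = V & 0xFFFFu; // the ISD::AND mask added above
  return packusLane(static_cast<int32_t>(Masked)); // equals (uint16_t)V
}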
49570
49571/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
49572static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
49573 const X86Subtarget &Subtarget,
49574 SelectionDAG &DAG) {
49575 SDValue In = N->getOperand(0);
49576 EVT InVT = In.getValueType();
49577 EVT OutVT = N->getValueType(0);
49578 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
49579 DAG.getValueType(OutVT));
49580 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
49581}
49582
49583/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
49584/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
49585/// legalization the truncation will be translated into a BUILD_VECTOR with each
49586/// element that is extracted from a vector and then truncated, and it is
49587/// difficult to do this optimization based on them.
49588static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
49589 const X86Subtarget &Subtarget) {
49590 EVT OutVT = N->getValueType(0);
49591 if (!OutVT.isVector())
49592 return SDValue();
49593
49594 SDValue In = N->getOperand(0);
49595 if (!In.getValueType().isSimple())
49596 return SDValue();
49597
49598 EVT InVT = In.getValueType();
49599 unsigned NumElems = OutVT.getVectorNumElements();
49600
49601 // AVX512 provides fast truncate ops.
49602 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
49603 return SDValue();
49604
49605 EVT OutSVT = OutVT.getVectorElementType();
49606 EVT InSVT = InVT.getVectorElementType();
49607 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
49608 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
49609 NumElems >= 8))
49610 return SDValue();
49611
49612 // SSSE3's pshufb results in fewer instructions in the cases below.
49613 if (Subtarget.hasSSSE3() && NumElems == 8) {
49614 if (InSVT == MVT::i16)
49615 return SDValue();
49616 if (InSVT == MVT::i32 &&
49617 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
49618 return SDValue();
49619 }
49620
49621 SDLoc DL(N);
49622 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
49623 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
49624 // truncate 2 x v4i32 to v8i16.
49625 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
49626 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
49627 if (InSVT == MVT::i32)
49628 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
49629
49630 return SDValue();
49631}
49632
49633 /// This function transforms vector truncation of 'extended sign-bits' or
49634 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
49635 /// X86ISD::PACKSS/PACKUS operations.
49636static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
49637 SelectionDAG &DAG,
49638 const X86Subtarget &Subtarget) {
49639 // Requires SSE2.
49640 if (!Subtarget.hasSSE2())
49641 return SDValue();
49642
49643 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
49644 return SDValue();
49645
49646 SDValue In = N->getOperand(0);
49647 if (!In.getValueType().isSimple())
49648 return SDValue();
49649
49650 MVT VT = N->getValueType(0).getSimpleVT();
49651 MVT SVT = VT.getScalarType();
49652
49653 MVT InVT = In.getValueType().getSimpleVT();
49654 MVT InSVT = InVT.getScalarType();
49655
49656 // Check we have a truncation suited for PACKSS/PACKUS.
49657 if (!isPowerOf2_32(VT.getVectorNumElements()))
49658 return SDValue();
49659 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
49660 return SDValue();
49661 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
49662 return SDValue();
49663
49664 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
49665 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
49666 return SDValue();
49667
49668 // AVX512 has fast truncate, but if the input is already going to be split,
49669 // there's no harm in trying pack.
49670 if (Subtarget.hasAVX512() &&
49671 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
49672 InVT.is512BitVector())) {
49673 // PACK should still be worth it for 128-bit vectors if the sources were
49674 // originally concatenated from subvectors.
49675 SmallVector<SDValue> ConcatOps;
49676 if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
49677 return SDValue();
49678 }
49679
49680 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
49681 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
49682
49683 // Use PACKUS if the input has zero-bits that extend all the way to the
49684 // packed/truncated value. e.g. masks, zext_in_reg, etc.
49685 KnownBits Known = DAG.computeKnownBits(In);
49686 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
49687 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
49688 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
49689
49690 // Use PACKSS if the input has sign-bits that extend all the way to the
49691 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
49692 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
49693
49694 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
49695 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
49696 // on and combines/simplifications can't then use it.
49697 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
49698 return SDValue();
49699
49700 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
49701 if (NumSignBits > MinSignBits)
49702 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
49703
49704 // If we have a srl that only generates signbits that we will discard in
49705 // the truncation then we can use PACKSS by converting the srl to a sra.
49706 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
49707 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
49708 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
49709 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
49710 if (*ShAmt == MinSignBits) {
49711 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
49712 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
49713 Subtarget);
49714 }
49715 }
49716
49717 return SDValue();
49718}
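// PACKSS/PACKUS only behave as a plain truncate when the bits being dropped
// are pure sign-extension (PACKSS) or known zero (PACKUS), which is exactly
// what the sign-bit / known-zero checks above establish. A one-lane scalar
// sketch of the PACKSS side (illustrative helper, not part of this file):
#include <cstdint>

// When V has at least 17 sign bits it already fits in i16, the clamps below
// never fire, and the saturating pack degenerates to a plain (int16_t)V.
static int16_t packssLane(int32_t V) {
  if (V > INT16_MAX)
    return INT16_MAX;
  if (V < INT16_MIN)
    return INT16_MIN;
  return static_cast<int16_t>(V);
}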
49719
49720// Try to form a MULHU or MULHS node by looking for
49721// (trunc (srl (mul ext, ext), 16))
49722// TODO: This is X86 specific because we want to be able to handle wide types
49723// before type legalization. But we can only do it if the vector will be
49724// legalized via widening/splitting. Type legalization can't handle promotion
49725// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49726// combiner.
49727static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
49728 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
49729 // First instruction should be a right shift of a multiply.
49730 if (Src.getOpcode() != ISD::SRL ||
49731 Src.getOperand(0).getOpcode() != ISD::MUL)
49732 return SDValue();
49733
49734 if (!Subtarget.hasSSE2())
49735 return SDValue();
49736
49737 // Only handle vXi16 types that are at least 128-bits unless they will be
49738 // widened.
49739 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
49740 return SDValue();
49741
49742 // Input type should be at least vXi32.
49743 EVT InVT = Src.getValueType();
49744 if (InVT.getVectorElementType().getSizeInBits() < 32)
49745 return SDValue();
49746
49747 // Need a shift by 16.
49748 APInt ShiftAmt;
49749 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
49750 ShiftAmt != 16)
49751 return SDValue();
49752
49753 SDValue LHS = Src.getOperand(0).getOperand(0);
49754 SDValue RHS = Src.getOperand(0).getOperand(1);
49755
49756 // Count leading sign/zero bits on both inputs - if there are enough then
49757 // truncation back to vXi16 will be cheap - either as a pack/shuffle
49758 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
49759 // truncations may actually be free by peeking through to the ext source.
49760 auto IsSext = [&DAG](SDValue V) {
49761 return DAG.ComputeMaxSignificantBits(V) <= 16;
49762 };
49763 auto IsZext = [&DAG](SDValue V) {
49764 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
49765 };
49766
49767 bool IsSigned = IsSext(LHS) && IsSext(RHS);
49768 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
49769 if (!IsSigned && !IsUnsigned)
49770 return SDValue();
49771
49772 // Check if both inputs are extensions, which will be removed by truncation.
49773 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
49774 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
49775 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
49776 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
49777 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
49778 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
49779
49780 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
49781 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
49782 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
49783 // will have to split anyway.
49784 unsigned InSizeInBits = InVT.getSizeInBits();
49785 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
49786 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
49787 (InSizeInBits % 16) == 0) {
49788 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49789 InVT.getSizeInBits() / 16);
49790 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
49791 DAG.getBitcast(BCVT, RHS));
49792 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
49793 }
49794
49795 // Truncate back to source type.
49796 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
49797 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
49798
49799 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
49800 return DAG.getNode(Opc, DL, VT, LHS, RHS);
49801}
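// The pattern matched here is the scalar "high half of the product" idiom:
// trunc((ext(a) * ext(b)) >> 16) is exactly MULHU/MULHS on the narrow type.
// A standalone sketch of both variants (helper names are illustrative, not
// part of this file):
#include <cstdint>

static uint16_t mulhu16(uint16_t A, uint16_t B) {
  return static_cast<uint16_t>((static_cast<uint32_t>(A) * B) >> 16);
}

static int16_t mulhs16(int16_t A, int16_t B) {
  return static_cast<int16_t>((static_cast<int32_t>(A) * B) >> 16);
}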
49802
49803// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
49804// from one vector with signed bytes from another vector, adds together
49805// adjacent pairs of 16-bit products, and saturates the result before
49806// truncating to 16-bits.
49807//
49808// Which looks something like this:
49809// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
49810// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
49811static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
49812 const X86Subtarget &Subtarget,
49813 const SDLoc &DL) {
49814 if (!VT.isVector() || !Subtarget.hasSSSE3())
49815 return SDValue();
49816
49817 unsigned NumElems = VT.getVectorNumElements();
49818 EVT ScalarVT = VT.getVectorElementType();
49819 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
49820 return SDValue();
49821
49822 SDValue SSatVal = detectSSatPattern(In, VT);
49823 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
49824 return SDValue();
49825
49826 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
49827 // of multiplies from even/odd elements.
49828 SDValue N0 = SSatVal.getOperand(0);
49829 SDValue N1 = SSatVal.getOperand(1);
49830
49831 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49832 return SDValue();
49833
49834 SDValue N00 = N0.getOperand(0);
49835 SDValue N01 = N0.getOperand(1);
49836 SDValue N10 = N1.getOperand(0);
49837 SDValue N11 = N1.getOperand(1);
49838
49839 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
49840 // Canonicalize zero_extend to LHS.
49841 if (N01.getOpcode() == ISD::ZERO_EXTEND)
49842 std::swap(N00, N01);
49843 if (N11.getOpcode() == ISD::ZERO_EXTEND)
49844 std::swap(N10, N11);
49845
49846 // Ensure we have a zero_extend and a sign_extend.
49847 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
49848 N01.getOpcode() != ISD::SIGN_EXTEND ||
49849 N10.getOpcode() != ISD::ZERO_EXTEND ||
49850 N11.getOpcode() != ISD::SIGN_EXTEND)
49851 return SDValue();
49852
49853 // Peek through the extends.
49854 N00 = N00.getOperand(0);
49855 N01 = N01.getOperand(0);
49856 N10 = N10.getOperand(0);
49857 N11 = N11.getOperand(0);
49858
49859 // Ensure the extend is from vXi8.
49860 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
49861 N01.getValueType().getVectorElementType() != MVT::i8 ||
49862 N10.getValueType().getVectorElementType() != MVT::i8 ||
49863 N11.getValueType().getVectorElementType() != MVT::i8)
49864 return SDValue();
49865
49866 // All inputs should be build_vectors.
49867 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49868 N01.getOpcode() != ISD::BUILD_VECTOR ||
49869 N10.getOpcode() != ISD::BUILD_VECTOR ||
49870 N11.getOpcode() != ISD::BUILD_VECTOR)
49871 return SDValue();
49872
49873 // N00/N10 are zero extended. N01/N11 are sign extended.
49874
49875 // For each output element, we need the even element from one vector
49876 // multiplied by the even element from the other vector, added to the
49877 // product of the corresponding odd elements from the same two vectors.
49878 // So we need to make sure that for each element i, this operation
49879 // is being performed:
49880 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49881 SDValue ZExtIn, SExtIn;
49882 for (unsigned i = 0; i != NumElems; ++i) {
49883 SDValue N00Elt = N00.getOperand(i);
49884 SDValue N01Elt = N01.getOperand(i);
49885 SDValue N10Elt = N10.getOperand(i);
49886 SDValue N11Elt = N11.getOperand(i);
49887 // TODO: Be more tolerant to undefs.
49888 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49889 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49890 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49891 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49892 return SDValue();
49893 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49894 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49895 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49896 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49897 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49898 return SDValue();
49899 unsigned IdxN00 = ConstN00Elt->getZExtValue();
49900 unsigned IdxN01 = ConstN01Elt->getZExtValue();
49901 unsigned IdxN10 = ConstN10Elt->getZExtValue();
49902 unsigned IdxN11 = ConstN11Elt->getZExtValue();
49903 // Add is commutative so indices can be reordered.
49904 if (IdxN00 > IdxN10) {
49905 std::swap(IdxN00, IdxN10);
49906 std::swap(IdxN01, IdxN11);
49907 }
49908 // N0 indices must be the even elements. N1 indices must be the next odd elements.
49909 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49910 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49911 return SDValue();
49912 SDValue N00In = N00Elt.getOperand(0);
49913 SDValue N01In = N01Elt.getOperand(0);
49914 SDValue N10In = N10Elt.getOperand(0);
49915 SDValue N11In = N11Elt.getOperand(0);
49916 // First time we find an input capture it.
49917 if (!ZExtIn) {
49918 ZExtIn = N00In;
49919 SExtIn = N01In;
49920 }
49921 if (ZExtIn != N00In || SExtIn != N01In ||
49922 ZExtIn != N10In || SExtIn != N11In)
49923 return SDValue();
49924 }
49925
49926 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49927 ArrayRef<SDValue> Ops) {
49928 // Shrink by adding truncate nodes and let DAGCombine fold with the
49929 // sources.
49930 EVT InVT = Ops[0].getValueType();
49931 assert(InVT.getScalarType() == MVT::i8 &&
49932 "Unexpected scalar element type");
49933 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49934 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49935 InVT.getVectorNumElements() / 2);
49936 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
49937 };
49938 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
49939 PMADDBuilder);
49940}
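// Per 16-bit output lane, PMADDUBSW computes a signed-saturated sum of two
// products: an unsigned byte from A times a signed byte from B, for the even
// and odd byte of each pair. A scalar model of one output lane (illustrative
// helper, not part of this file):
#include <cstdint>

static int16_t pmaddubswLane(uint8_t AEven, uint8_t AOdd, int8_t BEven,
                             int8_t BOdd) {
  int32_t Sum = static_cast<int32_t>(AEven) * BEven +
                static_cast<int32_t>(AOdd) * BOdd;
  if (Sum > INT16_MAX)
    return INT16_MAX; // signed saturation, as in detectSSatPattern above
  if (Sum < INT16_MIN)
    return INT16_MIN;
  return static_cast<int16_t>(Sum);
}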
49941
49942static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
49943 const X86Subtarget &Subtarget) {
49944 EVT VT = N->getValueType(0);
49945 SDValue Src = N->getOperand(0);
49946 SDLoc DL(N);
49947
49948 // Attempt to pre-truncate inputs to arithmetic ops instead.
49949 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
49950 return V;
49951
49952 // Try to detect AVG pattern first.
49953 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
49954 return Avg;
49955
49956 // Try to detect PMADD
49957 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
49958 return PMAdd;
49959
49960 // Try to combine truncation with signed/unsigned saturation.
49961 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
49962 return Val;
49963
49964 // Try to combine PMULHUW/PMULHW for vXi16.
49965 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
49966 return V;
49967
49968 // The bitcast source is a direct mmx result.
49970 // Detect bitcasts from x86mmx to i32.
49970 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
49971 SDValue BCSrc = Src.getOperand(0);
49972 if (BCSrc.getValueType() == MVT::x86mmx)
49973 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
49974 }
49975
49976 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
49977 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
49978 return V;
49979
49980 return combineVectorTruncation(N, DAG, Subtarget);
49981}
49982
49983static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
49984 TargetLowering::DAGCombinerInfo &DCI) {
49985 EVT VT = N->getValueType(0);
49986 SDValue In = N->getOperand(0);
49987 SDLoc DL(N);
49988
49989 if (auto SSatVal = detectSSatPattern(In, VT))
49990 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
49991 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
49992 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
49993
49994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49995 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
49996 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
49997 return SDValue(N, 0);
49998
49999 return SDValue();
50000}
50001
50002/// Returns the negated value if the node \p N flips sign of FP value.
50003///
50004/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
50005/// or FSUB(0, x)
50006/// AVX512F does not have FXOR, so FNEG is lowered as
50007/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
50008 /// In this case we go through all bitcasts.
50009/// This also recognizes splat of a negated value and returns the splat of that
50010/// value.
50011static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
50012 if (N->getOpcode() == ISD::FNEG)
50013 return N->getOperand(0);
50014
50015 // Don't recurse exponentially.
50016 if (Depth > SelectionDAG::MaxRecursionDepth)
50017 return SDValue();
50018
50019 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
50020
50021 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
50022 EVT VT = Op->getValueType(0);
50023
50024 // Make sure the element size doesn't change.
50025 if (VT.getScalarSizeInBits() != ScalarSize)
50026 return SDValue();
50027
50028 unsigned Opc = Op.getOpcode();
50029 switch (Opc) {
50030 case ISD::VECTOR_SHUFFLE: {
50031 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
50032 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
50033 if (!Op.getOperand(1).isUndef())
50034 return SDValue();
50035 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
50036 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
50037 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
50038 cast<ShuffleVectorSDNode>(Op)->getMask());
50039 break;
50040 }
50041 case ISD::INSERT_VECTOR_ELT: {
50042 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
50043 // -V, INDEX).
50044 SDValue InsVector = Op.getOperand(0);
50045 SDValue InsVal = Op.getOperand(1);
50046 if (!InsVector.isUndef())
50047 return SDValue();
50048 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
50049 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
50050 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
50051 NegInsVal, Op.getOperand(2));
50052 break;
50053 }
50054 case ISD::FSUB:
50055 case ISD::XOR:
50056 case X86ISD::FXOR: {
50057 SDValue Op1 = Op.getOperand(1);
50058 SDValue Op0 = Op.getOperand(0);
50059
50060 // For XOR and FXOR, we want to check if constant
50061 // bits of Op1 are sign bit masks. For FSUB, we
50062 // have to check if constant bits of Op0 are sign
50063 // bit masks and hence we swap the operands.
50064 if (Opc == ISD::FSUB)
50065 std::swap(Op0, Op1);
50066
50067 APInt UndefElts;
50068 SmallVector<APInt, 16> EltBits;
50069 // Extract constant bits and see if they are all
50070 // sign bit masks. Ignore the undef elements.
50071 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
50072 /* AllowWholeUndefs */ true,
50073 /* AllowPartialUndefs */ false)) {
50074 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
50075 if (!UndefElts[I] && !EltBits[I].isSignMask())
50076 return SDValue();
50077
50078 return peekThroughBitcasts(Op0);
50079 }
50080 }
50081 }
50082
50083 return SDValue();
50084}
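// The XOR/FXOR form is recognized above because flipping only the sign bit of
// an IEEE-754 value negates it. A standalone bit-level sketch (illustrative
// helper, not part of this file):
#include <cstdint>
#include <cstring>

static float fnegViaXor(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits)); // bitcast f32 -> i32
  Bits ^= 0x80000000u;                  // xor with the sign-bit mask
  float Result;
  std::memcpy(&Result, &Bits, sizeof(Result)); // bitcast back
  return Result;                               // == -X, including for +/-0.0
}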
50085
50086static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
50087 bool NegRes) {
50088 if (NegMul) {
50089 switch (Opcode) {
50090 default: llvm_unreachable("Unexpected opcode");
50091 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
50092 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
50093 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
50094 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
50095 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
50096 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
50097 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
50098 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
50099 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
50100 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
50101 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
50102 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
50103 }
50104 }
50105
50106 if (NegAcc) {
50107 switch (Opcode) {
50108 default: llvm_unreachable("Unexpected opcode");
50109 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
50110 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
50111 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
50112 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
50113 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
50114 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
50115 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
50116 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
50117 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
50118 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
50119 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
50120 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
50121 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
50122 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
50123 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
50124 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
50125 }
50126 }
50127
50128 if (NegRes) {
50129 switch (Opcode) {
50130 // For accuracy reasons, we never combine fneg and fma under strict FP.
50131 default: llvm_unreachable("Unexpected opcode");
50132 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
50133 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
50134 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
50135 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
50136 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
50137 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
50138 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
50139 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
50140 }
50141 }
50142
50143 return Opcode;
50144}
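// The remapping above encodes simple algebra on a*b + c: negating one
// multiplicand swaps FMADD<->FNMADD and FMSUB<->FNMSUB, negating the
// accumulator swaps FMADD<->FMSUB and FNMADD<->FNMSUB, and negating the whole
// result swaps FMADD<->FNMSUB and FMSUB<->FNMADD. A scalar sketch of the four
// flavors (illustrative helpers, not part of this file):
#include <cmath>

static double fmadd(double A, double B, double C) { return std::fma(A, B, C); }
static double fmsub(double A, double B, double C) { return std::fma(A, B, -C); }
static double fnmadd(double A, double B, double C) { return std::fma(-A, B, C); }
static double fnmsub(double A, double B, double C) { return std::fma(-A, B, -C); }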
50145
50146/// Do target-specific dag combines on floating point negations.
50147static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
50148 TargetLowering::DAGCombinerInfo &DCI,
50149 const X86Subtarget &Subtarget) {
50150 EVT OrigVT = N->getValueType(0);
50151 SDValue Arg = isFNEG(DAG, N);
50152 if (!Arg)
50153 return SDValue();
50154
50155 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50156 EVT VT = Arg.getValueType();
50157 EVT SVT = VT.getScalarType();
50158 SDLoc DL(N);
50159
50160 // Let legalize expand this if it isn't a legal type yet.
50161 if (!TLI.isTypeLegal(VT))
50162 return SDValue();
50163
50164 // If we're negating a FMUL node on a target with FMA, then we can avoid the
50165 // use of a constant by performing (-0 - A*B) instead.
50166 // FIXME: Check rounding control flags as well once it becomes available.
50167 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
50168 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
50169 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
50170 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
50171 Arg.getOperand(1), Zero);
50172 return DAG.getBitcast(OrigVT, NewNode);
50173 }
50174
50175 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
50176 bool LegalOperations = !DCI.isBeforeLegalizeOps();
50177 if (SDValue NegArg =
50178 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
50179 return DAG.getBitcast(OrigVT, NegArg);
50180
50181 return SDValue();
50182}
50183
50184SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
50185 bool LegalOperations,
50186 bool ForCodeSize,
50187 NegatibleCost &Cost,
50188 unsigned Depth) const {
50189 // fneg patterns are removable even if they have multiple uses.
50190 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
50191 Cost = NegatibleCost::Cheaper;
50192 return DAG.getBitcast(Op.getValueType(), Arg);
50193 }
50194
50195 EVT VT = Op.getValueType();
50196 EVT SVT = VT.getScalarType();
50197 unsigned Opc = Op.getOpcode();
50198 SDNodeFlags Flags = Op.getNode()->getFlags();
50199 switch (Opc) {
50200 case ISD::FMA:
50201 case X86ISD::FMSUB:
50202 case X86ISD::FNMADD:
50203 case X86ISD::FNMSUB:
50204 case X86ISD::FMADD_RND:
50205 case X86ISD::FMSUB_RND:
50206 case X86ISD::FNMADD_RND:
50207 case X86ISD::FNMSUB_RND: {
50208 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
50209 !(SVT == MVT::f32 || SVT == MVT::f64) ||
50210 !isOperationLegal(ISD::FMA, VT))
50211 break;
50212
50213 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
50214 // if it may have signed zeros.
50215 if (!Flags.hasNoSignedZeros())
50216 break;
50217
50218 // This is always negatible for free but we might be able to remove some
50219 // extra operand negations as well.
50220 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
50221 for (int i = 0; i != 3; ++i)
50222 NewOps[i] = getCheaperNegatedExpression(
50223 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
50224
50225 bool NegA = !!NewOps[0];
50226 bool NegB = !!NewOps[1];
50227 bool NegC = !!NewOps[2];
50228 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
50229
50230 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
50231 : NegatibleCost::Neutral;
50232
50233 // Fill in the non-negated ops with the original values.
50234 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
50235 if (!NewOps[i])
50236 NewOps[i] = Op.getOperand(i);
50237 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
50238 }
50239 case X86ISD::FRCP:
50240 if (SDValue NegOp0 =
50241 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
50242 ForCodeSize, Cost, Depth + 1))
50243 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
50244 break;
50245 }
50246
50247 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
50248 ForCodeSize, Cost, Depth);
50249}
50250
50251static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
50252 const X86Subtarget &Subtarget) {
50253 MVT VT = N->getSimpleValueType(0);
50254 // If we have integer vector types available, use the integer opcodes.
50255 if (!VT.isVector() || !Subtarget.hasSSE2())
50256 return SDValue();
50257
50258 SDLoc dl(N);
50259
50260 unsigned IntBits = VT.getScalarSizeInBits();
50261 MVT IntSVT = MVT::getIntegerVT(IntBits);
50262 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
50263
50264 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
50265 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
50266 unsigned IntOpcode;
50267 switch (N->getOpcode()) {
50268 default: llvm_unreachable("Unexpected FP logic op");
50269 case X86ISD::FOR: IntOpcode = ISD::OR; break;
50270 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
50271 case X86ISD::FAND: IntOpcode = ISD::AND; break;
50272 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
50273 }
50274 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
50275 return DAG.getBitcast(VT, IntOp);
50276}
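// FP logic ops only manipulate bit patterns, so bitcasting to integers, doing
// the integer op, and bitcasting back is value-preserving. A scalar sketch of
// the same trick for FAND (illustrative helper, not part of this file):
#include <cstdint>
#include <cstring>

static float fandViaIntegers(float A, float B) {
  uint32_t IA, IB;
  std::memcpy(&IA, &A, sizeof(IA)); // bitcast f32 -> i32
  std::memcpy(&IB, &B, sizeof(IB));
  uint32_t IR = IA & IB;            // the integer opcode (ISD::AND for FAND)
  float R;
  std::memcpy(&R, &IR, sizeof(R));  // bitcast back to f32
  return R;
}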
50277
50278
50279/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
50280static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
50281 if (N->getOpcode() != ISD::XOR)
50282 return SDValue();
50283
50284 SDValue LHS = N->getOperand(0);
50285 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
50286 return SDValue();
50287
50288 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
50289 X86::CondCode(LHS->getConstantOperandVal(0)));
50290 SDLoc DL(N);
50291 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
50292}
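// XOR-ing a 0/1 condition flag with 1 is the same as evaluating the inverted
// condition, which is the identity the fold above exploits. A trivial scalar
// sketch (illustrative only, not part of this file):
#include <cassert>

static void checkXor1InvertsSetcc(int A, int B) {
  unsigned IsEq = (A == B) ? 1u : 0u; // setcc with condition EQ
  unsigned IsNe = (A != B) ? 1u : 0u; // setcc with the opposite condition
  assert((IsEq ^ 1u) == IsNe);
  (void)IsEq;
  (void)IsNe;
}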
50293
50294static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
50295 TargetLowering::DAGCombinerInfo &DCI,
50296 const X86Subtarget &Subtarget) {
50297 SDValue N0 = N->getOperand(0);
50298 SDValue N1 = N->getOperand(1);
50299 EVT VT = N->getValueType(0);
50300
50301 // If this is SSE1 only convert to FXOR to avoid scalarization.
50302 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50303 return DAG.getBitcast(MVT::v4i32,
50304 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
50305 DAG.getBitcast(MVT::v4f32, N0),
50306 DAG.getBitcast(MVT::v4f32, N1)));
50307 }
50308
50309 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
50310 return Cmp;
50311
50312 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50313 return R;
50314
50315 if (SDValue R = combineBitOpWithShift(N, DAG))
50316 return R;
50317
50318 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50319 return FPLogic;
50320
50321 if (DCI.isBeforeLegalizeOps())
50322 return SDValue();
50323
50324 if (SDValue SetCC = foldXor1SetCC(N, DAG))
50325 return SetCC;
50326
50327 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
50328 return RV;
50329
50330 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
50331 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50332 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
50333 N0.getOperand(0).getValueType().isVector() &&
50334 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50335 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
50336 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
50337 N0.getOperand(0).getValueType()));
50338 }
50339
50340 // Handle AVX512 mask widening.
50341 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
50342 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
50343 VT.getVectorElementType() == MVT::i1 &&
50344 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
50345 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
50346 return DAG.getNode(
50347 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
50348 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
50349 N0.getOperand(2));
50350 }
50351
50352 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
50353 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
50354 // TODO: Under what circumstances could this be performed in DAGCombine?
50355 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
50356 N0.getOperand(0).getOpcode() == N->getOpcode()) {
50357 SDValue TruncExtSrc = N0.getOperand(0);
50358 auto *N1C = dyn_cast<ConstantSDNode>(N1);
50359 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
50360 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
50361 SDLoc DL(N);
50362 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
50363 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
50364 return DAG.getNode(ISD::XOR, DL, VT, LHS,
50365 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
50366 }
50367 }
50368
50369 return combineFneg(N, DAG, DCI, Subtarget);
50370}
50371
50372static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
50373 TargetLowering::DAGCombinerInfo &DCI,
50374 const X86Subtarget &Subtarget) {
50375 EVT VT = N->getValueType(0);
50376 unsigned NumBits = VT.getSizeInBits();
50377
50378 // TODO - Constant Folding.
50379
50380 // Simplify the inputs.
50381 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50382 APInt DemandedMask(APInt::getAllOnes(NumBits));
50383 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
50384 return SDValue(N, 0);
50385
50386 return SDValue();
50387}
50388
50389static bool isNullFPScalarOrVectorConst(SDValue V) {
50390 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
50391}
50392
50393/// If a value is a scalar FP zero or a vector FP zero (potentially including
50394/// undefined elements), return a zero constant that may be used to fold away
50395/// that value. In the case of a vector, the returned constant will not contain
50396/// undefined elements even if the input parameter does. This makes it suitable
50397/// to be used as a replacement operand with operations (eg, bitwise-and) where
50398/// an undef should not propagate.
50399static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
50400 const X86Subtarget &Subtarget) {
50401 if (!isNullFPScalarOrVectorConst(V))
50402 return SDValue();
50403
50404 if (V.getValueType().isVector())
50405 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
50406
50407 return V;
50408}
50409
50410static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
50411 const X86Subtarget &Subtarget) {
50412 SDValue N0 = N->getOperand(0);
50413 SDValue N1 = N->getOperand(1);
50414 EVT VT = N->getValueType(0);
50415 SDLoc DL(N);
50416
50417 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
50418 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
50419 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
50420 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
50421 return SDValue();
50422
50423 auto isAllOnesConstantFP = [](SDValue V) {
50424 if (V.getSimpleValueType().isVector())
50425 return ISD::isBuildVectorAllOnes(V.getNode());
50426 auto *C = dyn_cast<ConstantFPSDNode>(V);
50427 return C && C->getConstantFPValue()->isAllOnesValue();
50428 };
50429
50430 // fand (fxor X, -1), Y --> fandn X, Y
50431 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
50432 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
50433
50434 // fand X, (fxor Y, -1) --> fandn Y, X
50435 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
50436 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
50437
50438 return SDValue();
50439}
50440
50441/// Do target-specific dag combines on X86ISD::FAND nodes.
50442static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
50443 const X86Subtarget &Subtarget) {
50444 // FAND(0.0, x) -> 0.0
50445 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
50446 return V;
50447
50448 // FAND(x, 0.0) -> 0.0
50449 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
50450 return V;
50451
50452 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
50453 return V;
50454
50455 return lowerX86FPLogicOp(N, DAG, Subtarget);
50456}
50457
50458/// Do target-specific dag combines on X86ISD::FANDN nodes.
50459static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
50460 const X86Subtarget &Subtarget) {
50461 // FANDN(0.0, x) -> x
50462 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
50463 return N->getOperand(1);
50464
50465 // FANDN(x, 0.0) -> 0.0
50466 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
50467 return V;
50468
50469 return lowerX86FPLogicOp(N, DAG, Subtarget);
50470}
50471
50472/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
50473static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
50474 TargetLowering::DAGCombinerInfo &DCI,
50475 const X86Subtarget &Subtarget) {
50476 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
50477
50478 // F[X]OR(0.0, x) -> x
50479 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
50480 return N->getOperand(1);
50481
50482 // F[X]OR(x, 0.0) -> x
50483 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
50484 return N->getOperand(0);
50485
50486 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
50487 return NewVal;
50488
50489 return lowerX86FPLogicOp(N, DAG, Subtarget);
50490}
50491
50492/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
50493static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
50494 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
50495
50496 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
50497 if (!DAG.getTarget().Options.NoNaNsFPMath ||
50498 !DAG.getTarget().Options.NoSignedZerosFPMath)
50499 return SDValue();
50500
50501 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
50502 // into FMAXC and FMINC, which are commutative operations.
50503 unsigned NewOp = 0;
50504 switch (N->getOpcode()) {
50505 default: llvm_unreachable("unknown opcode");
50506 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
50507 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
50508 }
50509
50510 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
50511 N->getOperand(0), N->getOperand(1));
50512}
50513
50514static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
50515 const X86Subtarget &Subtarget) {
50516 if (Subtarget.useSoftFloat())
50517 return SDValue();
50518
50519 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50520
50521 EVT VT = N->getValueType(0);
50522 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
50523 (Subtarget.hasSSE2() && VT == MVT::f64) ||
50524 (Subtarget.hasFP16() && VT == MVT::f16) ||
50525 (VT.isVector() && TLI.isTypeLegal(VT))))
50526 return SDValue();
50527
50528 SDValue Op0 = N->getOperand(0);
50529 SDValue Op1 = N->getOperand(1);
50530 SDLoc DL(N);
50531 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
50532
50533 // If we don't have to respect NaN inputs, this is a direct translation to x86
50534 // min/max instructions.
50535 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
50536 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50537
50538 // If one of the operands is known non-NaN use the native min/max instructions
50539 // with the non-NaN input as second operand.
50540 if (DAG.isKnownNeverNaN(Op1))
50541 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50542 if (DAG.isKnownNeverNaN(Op0))
50543 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
50544
50545 // If we have to respect NaN inputs, this takes at least 3 instructions.
50546 // Favor a library call when operating on a scalar and minimizing code size.
50547 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
50548 return SDValue();
50549
50550 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
50551 VT);
50552
50553 // There are 4 possibilities involving NaN inputs, and these are the required
50554 // outputs:
50555 // Op1
50556 // Num NaN
50557 // ----------------
50558 // Num | Max | Op0 |
50559 // Op0 ----------------
50560 // NaN | Op1 | NaN |
50561 // ----------------
50562 //
50563 // The SSE FP max/min instructions were not designed for this case, but rather
50564 // to implement:
50565 // Min = Op1 < Op0 ? Op1 : Op0
50566 // Max = Op1 > Op0 ? Op1 : Op0
50567 //
50568 // So they always return Op0 if either input is a NaN. However, we can still
50569 // use those instructions for fmaxnum by selecting away a NaN input.
50570
50571 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
50572 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
50573 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
50574
50575 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
50576 // are NaN, the NaN value of Op1 is the result.
50577 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
50578}
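// The lowering above commutes the operands so the SSE node passes Op0 through
// whenever either input is NaN, then selects Op1 when Op0 itself is NaN,
// reproducing the table in the comment. A scalar model of that recipe for
// fmaxnum (illustrative helpers, not part of this file):
#include <cmath>

static double sseMax(double Src1, double Src2) {
  // maxsd semantics: an ordered compare, so any NaN falls through to Src2.
  return Src1 > Src2 ? Src1 : Src2;
}

static double fmaxnumViaSSE(double Op0, double Op1) {
  double MinOrMax = sseMax(Op1, Op0); // NaN in either input -> Op0
  bool IsOp0Nan = std::isnan(Op0);    // the SETUO compare of Op0 with itself
  return IsOp0Nan ? Op1 : MinOrMax;
}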
50579
50580static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
50581 TargetLowering::DAGCombinerInfo &DCI) {
50582 EVT VT = N->getValueType(0);
50583 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50584
50585 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50586 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50587 return SDValue(N, 0);
50588
50589 // Convert a full vector load into vzload when not all bits are needed.
50590 SDValue In = N->getOperand(0);
50591 MVT InVT = In.getSimpleValueType();
50592 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50593 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50594 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50595 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
50596 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50597 MVT MemVT = MVT::getIntegerVT(NumBits);
50598 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50599 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50600 SDLoc dl(N);
50601 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
50602 DAG.getBitcast(InVT, VZLoad));
50603 DCI.CombineTo(N, Convert);
50604 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50605 DCI.recursivelyDeleteUnusedNodes(LN);
50606 return SDValue(N, 0);
50607 }
50608 }
50609
50610 return SDValue();
50611}
50612
50613static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
50614 TargetLowering::DAGCombinerInfo &DCI) {
50615 bool IsStrict = N->isTargetStrictFPOpcode();
50616 EVT VT = N->getValueType(0);
50617
50618 // Convert a full vector load into vzload when not all bits are needed.
50619 SDValue In = N->getOperand(IsStrict ? 1 : 0);
50620 MVT InVT = In.getSimpleValueType();
50621 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50622 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50623 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50624 LoadSDNode *LN = cast<LoadSDNode>(In);
50625 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50626 MVT MemVT = MVT::getFloatingPointVT(NumBits);
50627 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50628 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50629 SDLoc dl(N);
50630 if (IsStrict) {
50631 SDValue Convert =
50632 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
50633 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
50634 DCI.CombineTo(N, Convert, Convert.getValue(1));
50635 } else {
50636 SDValue Convert =
50637 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
50638 DCI.CombineTo(N, Convert);
50639 }
50640 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50641 DCI.recursivelyDeleteUnusedNodes(LN);
50642 return SDValue(N, 0);
50643 }
50644 }
50645
50646 return SDValue();
50647}
50648
50649/// Do target-specific dag combines on X86ISD::ANDNP nodes.
50650static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
50651 TargetLowering::DAGCombinerInfo &DCI,
50652 const X86Subtarget &Subtarget) {
50653 SDValue N0 = N->getOperand(0);
50654 SDValue N1 = N->getOperand(1);
50655 MVT VT = N->getSimpleValueType(0);
50656
50657 // ANDNP(0, x) -> x
50658 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50659 return N1;
50660
50661 // ANDNP(x, 0) -> 0
50662 if (ISD::isBuildVectorAllZeros(N1.getNode()))
50663 return DAG.getConstant(0, SDLoc(N), VT);
50664
50665 // Turn ANDNP back to AND if input is inverted.
50666 if (SDValue Not = IsNOT(N0, DAG))
50667 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
50668
50669 // Attempt to recursively combine a bitmask ANDNP with shuffles.
50670 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50671 SDValue Op(N, 0);
50672 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50673 return Res;
50674
50675 // If either operand is a constant mask, then only the elements that aren't
50676 // zero are actually demanded by the other operand.
50677 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
50678 APInt UndefElts;
50679 SmallVector<APInt> EltBits;
50680 int NumElts = VT.getVectorNumElements();
50681 int EltSizeInBits = VT.getScalarSizeInBits();
50682 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50683 APInt DemandedElts = APInt::getAllOnes(NumElts);
50684 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50685 EltBits)) {
50686 DemandedBits.clearAllBits();
50687 DemandedElts.clearAllBits();
50688 for (int I = 0; I != NumElts; ++I)
50689 if ((Invert && !EltBits[I].isAllOnes()) ||
50690 (!Invert && !EltBits[I].isZero())) {
50691 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
50692 DemandedElts.setBit(I);
50693 }
50694 }
50695 return std::make_pair(DemandedBits, DemandedElts);
50696 };
50697 std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
50698 std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0, true);
50699
50700 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50701 if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
50702 TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
50703 TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
50704 TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
50705 if (N->getOpcode() != ISD::DELETED_NODE)
50706 DCI.AddToWorklist(N);
50707 return SDValue(N, 0);
50708 }
50709 }
50710
50711 return SDValue();
50712}
50713
50714static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
50715 TargetLowering::DAGCombinerInfo &DCI) {
50716 SDValue N1 = N->getOperand(1);
50717
50718 // BT ignores high bits in the bit index operand.
50719 unsigned BitWidth = N1.getValueSizeInBits();
50720 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
50721 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
50722 if (N->getOpcode() != ISD::DELETED_NODE)
50723 DCI.AddToWorklist(N);
50724 return SDValue(N, 0);
50725 }
50726
50727 return SDValue();
50728}
50729
50730static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
50731 TargetLowering::DAGCombinerInfo &DCI) {
50732 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
50733 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50734
50735 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
50736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50737 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
50738 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
50739 if (N->getOpcode() != ISD::DELETED_NODE)
50740 DCI.AddToWorklist(N);
50741 return SDValue(N, 0);
50742 }
50743
50744 // Convert a full vector load into vzload when not all bits are needed.
50745 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
50746 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
50747 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
50748 SDLoc dl(N);
50749 if (IsStrict) {
50750 SDValue Convert = DAG.getNode(
50751 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
50752 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
50753 DCI.CombineTo(N, Convert, Convert.getValue(1));
50754 } else {
50755 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
50756 DAG.getBitcast(MVT::v8i16, VZLoad));
50757 DCI.CombineTo(N, Convert);
50758 }
50759
50760 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50761 DCI.recursivelyDeleteUnusedNodes(LN);
50762 return SDValue(N, 0);
50763 }
50764 }
50765 }
50766
50767 return SDValue();
50768}
50769
50770// Try to combine sext_in_reg of a cmov of constants by extending the constants.
50771static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
50772 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50773
50774 EVT DstVT = N->getValueType(0);
50775
50776 SDValue N0 = N->getOperand(0);
50777 SDValue N1 = N->getOperand(1);
50778 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50779
50780 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
50781 return SDValue();
50782
50783 // Look through single use any_extends / truncs.
50784 SDValue IntermediateBitwidthOp;
50785 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
50786 N0.hasOneUse()) {
50787 IntermediateBitwidthOp = N0;
50788 N0 = N0.getOperand(0);
50789 }
50790
50791 // See if we have a single use cmov.
50792 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
50793 return SDValue();
50794
50795 SDValue CMovOp0 = N0.getOperand(0);
50796 SDValue CMovOp1 = N0.getOperand(1);
50797
50798 // Make sure both operands are constants.
50799 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50800 !isa<ConstantSDNode>(CMovOp1.getNode()))
50801 return SDValue();
50802
50803 SDLoc DL(N);
50804
50805 // If we looked through an any_extend/trunc above, apply the same op to the constants.
50806 if (IntermediateBitwidthOp) {
50807 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
50808 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
50809 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
50810 }
50811
50812 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
50813 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
50814
50815 EVT CMovVT = DstVT;
50816 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
50817 if (DstVT == MVT::i16) {
50818 CMovVT = MVT::i32;
50819 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
50820 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
50821 }
50822
50823 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
50824 N0.getOperand(2), N0.getOperand(3));
50825
50826 if (CMovVT != DstVT)
50827 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
50828
50829 return CMov;
50830}
50831
50832static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
50833 const X86Subtarget &Subtarget) {
50834 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50835
50836 if (SDValue V = combineSextInRegCmov(N, DAG))
50837 return V;
50838
50839 EVT VT = N->getValueType(0);
50840 SDValue N0 = N->getOperand(0);
50841 SDValue N1 = N->getOperand(1);
50842 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50843 SDLoc dl(N);
50844
50845 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
50846 // SSE and AVX2 since there is no sign-extended shift right
50847 // operation on a vector with 64-bit elements.
50848 // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
50849 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
50850 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
50851 N0.getOpcode() == ISD::SIGN_EXTEND)) {
50852 SDValue N00 = N0.getOperand(0);
50853
50854 // EXTLOAD has a better solution on AVX2,
50855 // it may be replaced with X86ISD::VSEXT node.
50856 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
50857 if (!ISD::isNormalLoad(N00.getNode()))
50858 return SDValue();
50859
50860 // Attempt to promote any comparison mask ops before moving the
50861 // SIGN_EXTEND_INREG in the way.
50862 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
50863 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
50864
50865 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
50866 SDValue Tmp =
50867 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
50868 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
50869 }
50870 }
50871 return SDValue();
50872}
50873
50874/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
50875/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
50876/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
50877/// opportunities to combine math ops, use an LEA, or use a complex addressing
50878/// mode. This can eliminate extend, add, and shift instructions.
50879static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
50880 const X86Subtarget &Subtarget) {
50881 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
50882 Ext->getOpcode() != ISD::ZERO_EXTEND)
50883 return SDValue();
50884
50885 // TODO: This should be valid for other integer types.
50886 EVT VT = Ext->getValueType(0);
50887 if (VT != MVT::i64)
50888 return SDValue();
50889
50890 SDValue Add = Ext->getOperand(0);
50891 if (Add.getOpcode() != ISD::ADD)
50892 return SDValue();
50893
50894 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
50895 bool NSW = Add->getFlags().hasNoSignedWrap();
50896 bool NUW = Add->getFlags().hasNoUnsignedWrap();
50897
50898 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
50899 // into the 'zext'
50900 if ((Sext && !NSW) || (!Sext && !NUW))
50901 return SDValue();
50902
50903 // Having a constant operand to the 'add' ensures that we are not increasing
50904 // the instruction count because the constant is extended for free below.
50905 // A constant operand can also become the displacement field of an LEA.
50906 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
50907 if (!AddOp1)
50908 return SDValue();
50909
50910 // Don't make the 'add' bigger if there's no hope of combining it with some
50911 // other 'add' or 'shl' instruction.
50912 // TODO: It may be profitable to generate simpler LEA instructions in place
50913 // of single 'add' instructions, but the cost model for selecting an LEA
50914 // currently has a high threshold.
50915 bool HasLEAPotential = false;
50916 for (auto *User : Ext->uses()) {
50917 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
50918 HasLEAPotential = true;
50919 break;
50920 }
50921 }
50922 if (!HasLEAPotential)
50923 return SDValue();
50924
50925 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
50926 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
50927 SDValue AddOp0 = Add.getOperand(0);
50928 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
50929 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
50930
50931 // The wider add is guaranteed to not wrap because both operands are
50932 // sign-extended.
50933 SDNodeFlags Flags;
50934 Flags.setNoSignedWrap(NSW);
50935 Flags.setNoUnsignedWrap(NUW);
50936 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
50937}
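
promoteExtBeforeAdd above relies on sext(add_nsw(x, C)) == add(sext(x), C_sext) and the nuw/zext analogue. A minimal standalone check of that identity, assuming i32 -> i64 widening; the values are illustrative only and are chosen so the narrow add satisfies the nsw/nuw precondition.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t C = 40;
  for (int32_t X : {-1000, -1, 0, 7, 123456}) {
    int64_t WideSum = (int64_t)X + (int64_t)C;      // add(sext(x), C_sext)
    // 'nsw' precondition: the narrow add does not overflow i32.
    assert(WideSum >= INT32_MIN && WideSum <= INT32_MAX);
    int32_t NarrowSum = X + C;                      // add_nsw(x, C)
    assert((int64_t)NarrowSum == WideSum);          // sext(add_nsw(x, C))

    // The zext/nuw variant, guarded so the unsigned add does not wrap in i32.
    uint32_t UX = (uint32_t)X, UC = 40;
    if ((uint64_t)UX + UC <= UINT32_MAX)
      assert((uint64_t)(UX + UC) == (uint64_t)UX + (uint64_t)UC);
  }
  return 0;
}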
50938
50939// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
50940// operands and the result of CMOV is not used anywhere else - promote CMOV
50941// itself instead of promoting its result. This could be beneficial, because:
50942// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
50943// (or more) pseudo-CMOVs only when they go one-after-another and
50944// getting rid of result extension code after CMOV will help that.
50945// 2) Promotion of constant CMOV arguments is free, hence the
50946// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
50947 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
50948 // promotion is also good in terms of code size.
50949 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
50950 // promotion).
50951static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
50952 SDValue CMovN = Extend->getOperand(0);
50953 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
50954 return SDValue();
50955
50956 EVT TargetVT = Extend->getValueType(0);
50957 unsigned ExtendOpcode = Extend->getOpcode();
50958 SDLoc DL(Extend);
50959
50960 EVT VT = CMovN.getValueType();
50961 SDValue CMovOp0 = CMovN.getOperand(0);
50962 SDValue CMovOp1 = CMovN.getOperand(1);
50963
50964 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50965 !isa<ConstantSDNode>(CMovOp1.getNode()))
50966 return SDValue();
50967
50968 // Only extend to i32 or i64.
50969 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
50970 return SDValue();
50971
50972 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
50973 // are free.
50974 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
50975 return SDValue();
50976
50977 // If this is a zero extend to i64, we should only extend to i32 and use a free
50978 // zero extend to finish.
50979 EVT ExtendVT = TargetVT;
50980 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
50981 ExtendVT = MVT::i32;
50982
50983 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
50984 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
50985
50986 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
50987 CMovN.getOperand(2), CMovN.getOperand(3));
50988
50989 // Finish extending if needed.
50990 if (ExtendVT != TargetVT)
50991 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
50992
50993 return Res;
50994}
50995
50996// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
50997// result type.
50998static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
50999 const X86Subtarget &Subtarget) {
51000 SDValue N0 = N->getOperand(0);
51001 EVT VT = N->getValueType(0);
51002 SDLoc dl(N);
51003
51004 // Only do this combine with AVX512 for vector extends.
51005 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
51006 return SDValue();
51007
51008 // Only combine legal element types.
51009 EVT SVT = VT.getVectorElementType();
51010 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
51011 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
51012 return SDValue();
51013
51014 // We don't have a CMPP instruction for vXf16.
51015 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
51016 return SDValue();
51017 // We can only do this if the vector size is 256 bits or less.
51018 unsigned Size = VT.getSizeInBits();
51019 if (Size > 256 && Subtarget.useAVX512Regs())
51020 return SDValue();
51021
51022 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
51023 // those are the only integer compares we have.
51024 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51025 if (ISD::isUnsignedIntSetCC(CC))
51026 return SDValue();
51027
51028 // Only do this combine if the extension will be fully consumed by the setcc.
51029 EVT N00VT = N0.getOperand(0).getValueType();
51030 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
51031 if (Size != MatchingVecType.getSizeInBits())
51032 return SDValue();
51033
51034 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
51035
51036 if (N->getOpcode() == ISD::ZERO_EXTEND)
51037 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
51038
51039 return Res;
51040}
51041
51042static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
51043 TargetLowering::DAGCombinerInfo &DCI,
51044 const X86Subtarget &Subtarget) {
51045 SDValue N0 = N->getOperand(0);
51046 EVT VT = N->getValueType(0);
51047 SDLoc DL(N);
51048
51049 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
51050 if (!DCI.isBeforeLegalizeOps() &&
51051 N0.getOpcode() == X86ISD::SETCC_CARRY) {
51052 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
51053 N0->getOperand(1));
51054 bool ReplaceOtherUses = !N0.hasOneUse();
51055 DCI.CombineTo(N, Setcc);
51056 // Replace other uses with a truncate of the widened setcc_carry.
51057 if (ReplaceOtherUses) {
51058 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
51059 N0.getValueType(), Setcc);
51060 DCI.CombineTo(N0.getNode(), Trunc);
51061 }
51062
51063 return SDValue(N, 0);
51064 }
51065
51066 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
51067 return NewCMov;
51068
51069 if (!DCI.isBeforeLegalizeOps())
51070 return SDValue();
51071
51072 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
51073 return V;
51074
51075 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
51076 DAG, DCI, Subtarget))
51077 return V;
51078
51079 if (VT.isVector()) {
51080 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
51081 return R;
51082
51083 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
51084 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
51085 }
51086
51087 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
51088 return NewAdd;
51089
51090 return SDValue();
51091}
51092
51093static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
51094 TargetLowering::DAGCombinerInfo &DCI,
51095 const X86Subtarget &Subtarget) {
51096 SDLoc dl(N);
51097 EVT VT = N->getValueType(0);
51098 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
51099
51100 // Let legalize expand this if it isn't a legal type yet.
51101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51102 if (!TLI.isTypeLegal(VT))
51103 return SDValue();
51104
51105 SDValue A = N->getOperand(IsStrict ? 1 : 0);
51106 SDValue B = N->getOperand(IsStrict ? 2 : 1);
51107 SDValue C = N->getOperand(IsStrict ? 3 : 2);
51108
51109 // If the operation allows fast-math and the target does not support FMA,
51110 // split this into mul+add to avoid libcall(s).
51111 SDNodeFlags Flags = N->getFlags();
51112 if (!IsStrict && Flags.hasAllowReassociation() &&
51113 TLI.isOperationExpand(ISD::FMA, VT)) {
51114 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
51115 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
51116 }
51117
51118 EVT ScalarVT = VT.getScalarType();
51119 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
51120 !Subtarget.hasAnyFMA()) &&
51121 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
51122 return SDValue();
51123
51124 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
51125 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51126 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51127 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
51128 CodeSize)) {
51129 V = NegV;
51130 return true;
51131 }
51132 // Look through extract_vector_elts. If it comes from an FNEG, create a
51133 // new extract from the FNEG input.
51134 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51135 isNullConstant(V.getOperand(1))) {
51136 SDValue Vec = V.getOperand(0);
51137 if (SDValue NegV = TLI.getCheaperNegatedExpression(
51138 Vec, DAG, LegalOperations, CodeSize)) {
51139 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
51140 NegV, V.getOperand(1));
51141 return true;
51142 }
51143 }
51144
51145 return false;
51146 };
51147
51148 // Do not convert the passthru input of scalar intrinsics.
51149 // FIXME: We could allow negations of the lower element only.
51150 bool NegA = invertIfNegative(A);
51151 bool NegB = invertIfNegative(B);
51152 bool NegC = invertIfNegative(C);
51153
51154 if (!NegA && !NegB && !NegC)
51155 return SDValue();
51156
51157 unsigned NewOpcode =
51158 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
51159
51160 // Propagate fast-math-flags to new FMA node.
51161 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
51162 if (IsStrict) {
51163 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
51164 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
51165 {N->getOperand(0), A, B, C});
51166 } else {
51167 if (N->getNumOperands() == 4)
51168 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
51169 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
51170 }
51171}
51172
51173// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
51174// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
51175static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
51176 TargetLowering::DAGCombinerInfo &DCI) {
51177 SDLoc dl(N);
51178 EVT VT = N->getValueType(0);
51179 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51180 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51181 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51182
51183 SDValue N2 = N->getOperand(2);
51184
51185 SDValue NegN2 =
51186 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
51187 if (!NegN2)
51188 return SDValue();
51189 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
51190
51191 if (N->getNumOperands() == 4)
51192 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
51193 NegN2, N->getOperand(3));
51194 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
51195 NegN2);
51196}
51197
51198static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
51199 TargetLowering::DAGCombinerInfo &DCI,
51200 const X86Subtarget &Subtarget) {
51201 SDLoc dl(N);
51202 SDValue N0 = N->getOperand(0);
51203 EVT VT = N->getValueType(0);
51204
51205 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
51206 // FIXME: Is this needed? We don't seem to have any tests for it.
51207 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
51208 N0.getOpcode() == X86ISD::SETCC_CARRY) {
51209 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
51210 N0->getOperand(1));
51211 bool ReplaceOtherUses = !N0.hasOneUse();
51212 DCI.CombineTo(N, Setcc);
51213 // Replace other uses with a truncate of the widened setcc_carry.
51214 if (ReplaceOtherUses) {
51215 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
51216 N0.getValueType(), Setcc);
51217 DCI.CombineTo(N0.getNode(), Trunc);
51218 }
51219
51220 return SDValue(N, 0);
51221 }
51222
51223 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
51224 return NewCMov;
51225
51226 if (DCI.isBeforeLegalizeOps())
51227 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
51228 return V;
51229
51230 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
51231 DAG, DCI, Subtarget))
51232 return V;
51233
51234 if (VT.isVector())
51235 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
51236 return R;
51237
51238 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
51239 return NewAdd;
51240
51241 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
51242 return R;
51243
51244 // TODO: Combine with any target/faux shuffle.
51245 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
51246 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
51247 SDValue N00 = N0.getOperand(0);
51248 SDValue N01 = N0.getOperand(1);
51249 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
51250 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
51251 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
51252 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
51253 return concatSubVectors(N00, N01, DAG, dl);
51254 }
51255 }
51256
51257 return SDValue();
51258}
51259
51260/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
51261/// recognizable memcmp expansion.
51262static bool isOrXorXorTree(SDValue X, bool Root = true) {
51263 if (X.getOpcode() == ISD::OR)
51264 return isOrXorXorTree(X.getOperand(0), false) &&
51265 isOrXorXorTree(X.getOperand(1), false);
51266 if (Root)
51267 return false;
51268 return X.getOpcode() == ISD::XOR;
51269}
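
isOrXorXorTree above recognizes the memcmp expansion shape because an OR of XORs is zero exactly when every pair of chunks is equal. A small standalone check of that identity; the values are arbitrary and only illustrative.

#include <cassert>
#include <cstdint>

int main() {
  // or(xor(A,B), xor(C,D)) == 0  <=>  A == B && C == D
  struct Case { uint64_t A, B, C, D; } Cases[] = {
      {1, 1, 2, 2},             // all pairs equal   -> tree is 0
      {1, 1, 2, 3},             // one pair differs  -> tree is nonzero
      {0xdead, 0xbeef, 5, 5},
  };
  for (const Case &T : Cases) {
    uint64_t Tree = (T.A ^ T.B) | (T.C ^ T.D);
    bool AllEqual = (T.A == T.B) && (T.C == T.D);
    assert((Tree == 0) == AllEqual);
  }
  return 0;
}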
51270
51271/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
51272/// expansion.
51273template<typename F>
51274static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
51275 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
51276 SDValue Op0 = X.getOperand(0);
51277 SDValue Op1 = X.getOperand(1);
51278 if (X.getOpcode() == ISD::OR) {
51279 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
51280 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
51281 if (VecVT != CmpVT)
51282 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
51283 if (HasPT)
51284 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
51285 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
51286 } else if (X.getOpcode() == ISD::XOR) {
51287 SDValue A = SToV(Op0);
51288 SDValue B = SToV(Op1);
51289 if (VecVT != CmpVT)
51290 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
51291 if (HasPT)
51292 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
51293 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
51294 }
51295 llvm_unreachable("Impossible");
51296}
51297
51298/// Try to map a 128-bit or larger integer comparison to vector instructions
51299/// before type legalization splits it up into chunks.
51300static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
51301 const X86Subtarget &Subtarget) {
51302 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
51303 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
51304
51305 // We're looking for an oversized integer equality comparison.
51306 SDValue X = SetCC->getOperand(0);
51307 SDValue Y = SetCC->getOperand(1);
51308 EVT OpVT = X.getValueType();
51309 unsigned OpSize = OpVT.getSizeInBits();
51310 if (!OpVT.isScalarInteger() || OpSize < 128)
51311 return SDValue();
51312
51313 // Ignore a comparison with zero because that gets special treatment in
51314 // EmitTest(). But make an exception for the special case of a pair of
51315 // logically-combined vector-sized operands compared to zero. This pattern may
51316 // be generated by the memcmp expansion pass with oversized integer compares
51317 // (see PR33325).
51318 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
51319 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
51320 return SDValue();
51321
51322 // Don't perform this combine if constructing the vector will be expensive.
51323 auto IsVectorBitCastCheap = [](SDValue X) {
51324 X = peekThroughBitcasts(X);
51325 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
51326 X.getOpcode() == ISD::LOAD;
51327 };
51328 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
51329 !IsOrXorXorTreeCCZero)
51330 return SDValue();
51331
51332 EVT VT = SetCC->getValueType(0);
51333 SDLoc DL(SetCC);
51334
51335 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
51336 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
51337 // Otherwise use PCMPEQ (plus AND) and mask testing.
51338 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
51339 (OpSize == 256 && Subtarget.hasAVX()) ||
51340 (OpSize == 512 && Subtarget.useAVX512Regs())) {
51341 bool HasPT = Subtarget.hasSSE41();
51342
51343 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
51344 // vector registers are essentially free. (Technically, widening registers
51345 // prevents load folding, but the tradeoff is worth it.)
51346 bool PreferKOT = Subtarget.preferMaskRegisters();
51347 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
51348
51349 EVT VecVT = MVT::v16i8;
51350 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
51351 if (OpSize == 256) {
51352 VecVT = MVT::v32i8;
51353 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
51354 }
51355 EVT CastVT = VecVT;
51356 bool NeedsAVX512FCast = false;
51357 if (OpSize == 512 || NeedZExt) {
51358 if (Subtarget.hasBWI()) {
51359 VecVT = MVT::v64i8;
51360 CmpVT = MVT::v64i1;
51361 if (OpSize == 512)
51362 CastVT = VecVT;
51363 } else {
51364 VecVT = MVT::v16i32;
51365 CmpVT = MVT::v16i1;
51366 CastVT = OpSize == 512 ? VecVT :
51367 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
51368 NeedsAVX512FCast = true;
51369 }
51370 }
51371
51372 auto ScalarToVector = [&](SDValue X) -> SDValue {
51373 bool TmpZext = false;
51374 EVT TmpCastVT = CastVT;
51375 if (X.getOpcode() == ISD::ZERO_EXTEND) {
51376 SDValue OrigX = X.getOperand(0);
51377 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
51378 if (OrigSize < OpSize) {
51379 if (OrigSize == 128) {
51380 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
51381 X = OrigX;
51382 TmpZext = true;
51383 } else if (OrigSize == 256) {
51384 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
51385 X = OrigX;
51386 TmpZext = true;
51387 }
51388 }
51389 }
51390 X = DAG.getBitcast(TmpCastVT, X);
51391 if (!NeedZExt && !TmpZext)
51392 return X;
51393 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
51394 DAG.getConstant(0, DL, VecVT), X,
51395 DAG.getVectorIdxConstant(0, DL));
51396 };
51397
51398 SDValue Cmp;
51399 if (IsOrXorXorTreeCCZero) {
51400 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
51401 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
51402 // Use 2 vector equality compares and 'and' the results before doing a
51403 // MOVMSK.
51404 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
51405 } else {
51406 SDValue VecX = ScalarToVector(X);
51407 SDValue VecY = ScalarToVector(Y);
51408 if (VecVT != CmpVT) {
51409 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
51410 } else if (HasPT) {
51411 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
51412 } else {
51413 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
51414 }
51415 }
51416 // AVX512 should emit a setcc that will lower to kortest.
51417 if (VecVT != CmpVT) {
51418 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
51419 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
51420 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
51421 DAG.getConstant(0, DL, KRegVT), CC);
51422 }
51423 if (HasPT) {
51424 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
51425 Cmp);
51426 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
51427 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
51428 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
51429 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
51430 }
51431 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
51432 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
51433 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
51434 assert(Cmp.getValueType() == MVT::v16i8 &&
51435        "Non 128-bit vector on pre-SSE41 target");
51436 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
51437 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
51438 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
51439 }
51440
51441 return SDValue();
51442}
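
The pre-SSE4.1 tail of combineVectorSizedSetCCEquality lowers a 128-bit equality to PCMPEQB followed by PMOVMSKB against 0xFFFF. A standalone intrinsics sketch of that lowering for comparison against memcmp; it assumes an SSE2-capable x86 host and the function name equal16 is invented for this example.

#include <cassert>
#include <cstring>
#include <emmintrin.h> // SSE2

// Compare two 16-byte blocks the way the pre-SSE4.1 lowering does:
// PCMPEQB yields 0xFF per matching byte, PMOVMSKB gathers the byte sign
// bits, and every byte matches exactly when the mask is 0xFFFF.
static bool equal16(const void *X, const void *Y) {
  __m128i VX = _mm_loadu_si128((const __m128i *)X);
  __m128i VY = _mm_loadu_si128((const __m128i *)Y);
  __m128i Eq = _mm_cmpeq_epi8(VX, VY);
  return _mm_movemask_epi8(Eq) == 0xFFFF;
}

int main() {
  char A[16], B[16];
  memset(A, 0x5a, sizeof(A));
  memcpy(B, A, sizeof(B));
  assert(equal16(A, B) && memcmp(A, B, 16) == 0);
  B[9] ^= 1; // flip one bit
  assert(!equal16(A, B) && memcmp(A, B, 16) != 0);
  return 0;
}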
51443
51444static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
51445 TargetLowering::DAGCombinerInfo &DCI,
51446 const X86Subtarget &Subtarget) {
51447 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
51448 const SDValue LHS = N->getOperand(0);
51449 const SDValue RHS = N->getOperand(1);
51450 EVT VT = N->getValueType(0);
51451 EVT OpVT = LHS.getValueType();
51452 SDLoc DL(N);
51453
51454 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
51455 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
51456 return V;
51457
51458 if (VT == MVT::i1 && isNullConstant(RHS)) {
51459 SDValue X86CC;
51460 if (SDValue V =
51461 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
51462 return DAG.getNode(ISD::TRUNCATE, DL, VT,
51463 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
51464 }
51465
51466 if (OpVT.isScalarInteger()) {
51467 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
51468 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
51469 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
51470 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
51471 if (N0.getOperand(0) == N1)
51472 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51473 N0.getOperand(1));
51474 if (N0.getOperand(1) == N1)
51475 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51476 N0.getOperand(0));
51477 }
51478 return SDValue();
51479 };
51480 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
51481 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51482 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
51483 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51484
51485 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
51486 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
51487 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
51488 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
51489 if (N0.getOperand(0) == N1)
51490 return DAG.getNode(ISD::AND, DL, OpVT, N1,
51491 DAG.getNOT(DL, N0.getOperand(1), OpVT));
51492 if (N0.getOperand(1) == N1)
51493 return DAG.getNode(ISD::AND, DL, OpVT, N1,
51494 DAG.getNOT(DL, N0.getOperand(0), OpVT));
51495 }
51496 return SDValue();
51497 };
51498 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
51499 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51500 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
51501 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51502
51503 // cmpeq(trunc(x),0) --> cmpeq(x,0)
51504 // cmpne(trunc(x),0) --> cmpne(x,0)
51505 // iff x upper bits are zero.
51506 // TODO: Add support for RHS to be truncate as well?
51507 if (LHS.getOpcode() == ISD::TRUNCATE &&
51508 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
51509 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
51510 EVT SrcVT = LHS.getOperand(0).getValueType();
51511 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
51512 OpVT.getScalarSizeInBits());
51513 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51514 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
51515 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
51516 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
51517 DAG.getConstant(0, DL, SrcVT), CC);
51518 }
51519 }
51520 }
51521
51522 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
51523 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
51524 // Using temporaries to avoid messing up operand ordering for later
51525 // transformations if this doesn't work.
51526 SDValue Op0 = LHS;
51527 SDValue Op1 = RHS;
51528 ISD::CondCode TmpCC = CC;
51529 // Put build_vector on the right.
51530 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
51531 std::swap(Op0, Op1);
51532 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
51533 }
51534
51535 bool IsSEXT0 =
51536 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
51537 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
51538 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
51539
51540 if (IsSEXT0 && IsVZero1) {
51541      assert(VT == Op0.getOperand(0).getValueType() &&
51542             "Unexpected operand type");
51543 if (TmpCC == ISD::SETGT)
51544 return DAG.getConstant(0, DL, VT);
51545 if (TmpCC == ISD::SETLE)
51546 return DAG.getConstant(1, DL, VT);
51547 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
51548 return DAG.getNOT(DL, Op0.getOperand(0), VT);
51549
51550      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
51551             "Unexpected condition code!");
51552 return Op0.getOperand(0);
51553 }
51554 }
51555
51556 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
51557 // pre-promote its result type since vXi1 vectors don't get promoted
51558 // during type legalization.
51559 // NOTE: The element count check is to ignore operand types that need to
51560 // go through type promotion to a 128-bit vector.
51561 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
51562 VT.getVectorElementType() == MVT::i1 &&
51563 (OpVT.getVectorElementType() == MVT::i8 ||
51564 OpVT.getVectorElementType() == MVT::i16)) {
51565 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
51566 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
51567 }
51568
51569 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
51570 // to avoid scalarization via legalization because v4i32 is not a legal type.
51571 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
51572 LHS.getValueType() == MVT::v4f32)
51573 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
51574
51575 // X pred 0.0 --> X pred -X
51576 // If the negation of X already exists, use it in the comparison. This removes
51577 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
51578 // instructions in patterns with a 'select' node.
51579 if (isNullFPScalarOrVectorConst(RHS)) {
51580 SDVTList FNegVT = DAG.getVTList(OpVT);
51581 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
51582 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
51583 }
51584
51585 return SDValue();
51586}
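
The scalar-integer rewrites inside combineSetCC (cmpeq(or(X,Y),X) and cmpeq(and(X,Y),Y) both becoming cmpeq(and(~X,Y),0)) follow from the fact that either comparison asks whether Y's set bits are contained in X. A short standalone check of both identities over a handful of illustrative values.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Vals[] = {0, 1, 0xff, 0xf0f0, 0x8000000000000000ull, ~0ull};
  for (uint64_t X : Vals)
    for (uint64_t Y : Vals) {
      // (X | Y) == X  <=>  Y has no bits outside X  <=>  (~X & Y) == 0
      assert(((X | Y) == X) == ((~X & Y) == 0));
      // (X & Y) == Y reduces to the same containment test.
      assert(((X & Y) == Y) == ((~X & Y) == 0));
    }
  return 0;
}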
51587
51588static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
51589 TargetLowering::DAGCombinerInfo &DCI,
51590 const X86Subtarget &Subtarget) {
51591 SDValue Src = N->getOperand(0);
51592 MVT SrcVT = Src.getSimpleValueType();
51593 MVT VT = N->getSimpleValueType(0);
51594 unsigned NumBits = VT.getScalarSizeInBits();
51595 unsigned NumElts = SrcVT.getVectorNumElements();
51596
51597 // Perform constant folding.
51598 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
51599 assert(VT == MVT::i32 && "Unexpected result type");
51600 APInt Imm(32, 0);
51601 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
51602 if (!Src.getOperand(Idx).isUndef() &&
51603 Src.getConstantOperandAPInt(Idx).isNegative())
51604 Imm.setBit(Idx);
51605 }
51606 return DAG.getConstant(Imm, SDLoc(N), VT);
51607 }
51608
51609 // Look through int->fp bitcasts that don't change the element width.
51610 unsigned EltWidth = SrcVT.getScalarSizeInBits();
51611 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
51612 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
51613 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
51614
51615 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
51616 // with scalar comparisons.
51617 if (SDValue NotSrc = IsNOT(Src, DAG)) {
51618 SDLoc DL(N);
51619 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51620 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
51621 return DAG.getNode(ISD::XOR, DL, VT,
51622 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
51623 DAG.getConstant(NotMask, DL, VT));
51624 }
51625
51626 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
51627 // results with scalar comparisons.
51628 if (Src.getOpcode() == X86ISD::PCMPGT &&
51629 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
51630 SDLoc DL(N);
51631 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51632 return DAG.getNode(ISD::XOR, DL, VT,
51633 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
51634 DAG.getConstant(NotMask, DL, VT));
51635 }
51636
51637 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
51638 // iff pow2splat(c1).
51639 if (Src.getOpcode() == X86ISD::PCMPEQ &&
51640 Src.getOperand(0).getOpcode() == ISD::AND &&
51641 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
51642 SDValue LHS = Src.getOperand(0).getOperand(0);
51643 SDValue RHS = Src.getOperand(0).getOperand(1);
51644 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
51645 if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
51646 SDLoc DL(N);
51647 MVT ShiftVT = SrcVT;
51648 if (ShiftVT.getScalarType() == MVT::i8) {
51649 // vXi8 shifts - we only care about the signbit so can use PSLLW.
51650 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
51651 LHS = DAG.getBitcast(ShiftVT, LHS);
51652 }
51653 unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros();
51654 LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
51655 ShiftAmt, DAG);
51656 LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
51657 return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
51658 }
51659 }
51660
51661 // Simplify the inputs.
51662 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51663 APInt DemandedMask(APInt::getAllOnes(NumBits));
51664 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51665 return SDValue(N, 0);
51666
51667 return SDValue();
51668}
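
The last fold in combineMOVMSK works because MOVMSK only reads the per-element sign bit: for a power-of-two mask 1<<K, the test (x & (1<<K)) == 0 is answered by the sign bit of ~(x << (W-1-K)), and W-1-K is exactly the countLeadingZeros shift amount used above. A standalone exhaustive check at an 8-bit element width; the variable names are illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned W = 8; // element width used for this sketch
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned K = 0; K < W; ++K) {
      bool CmpEqZero = (X & (1u << K)) == 0;               // pcmpeq(and(x,c),0) lane
      uint8_t Shifted = (uint8_t)(X << (W - 1 - K));       // shl by countLeadingZeros(c)
      bool SignOfNot = ((uint8_t)~Shifted >> (W - 1)) & 1; // sign bit of not(shl(x,c2))
      assert(CmpEqZero == SignOfNot);
    }
  return 0;
}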
51669
51670static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
51671 TargetLowering::DAGCombinerInfo &DCI,
51672 const X86Subtarget &Subtarget) {
51673 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
51674 SDValue BasePtr = MemOp->getBasePtr();
51675 SDValue Index = MemOp->getIndex();
51676 SDValue Scale = MemOp->getScale();
51677 SDValue Mask = MemOp->getMask();
51678
51679 // Attempt to fold an index scale into the scale value directly.
51680 // For smaller indices, implicit sext is performed BEFORE scale, preventing
51681 // this fold under most circumstances.
51682 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
51683 if ((Index.getOpcode() == X86ISD::VSHLI ||
51684 (Index.getOpcode() == ISD::ADD &&
51685 Index.getOperand(0) == Index.getOperand(1))) &&
51686 isa<ConstantSDNode>(Scale) &&
51687 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
51688 unsigned ShiftAmt =
51689 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
51690 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51691 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
51692 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
51693 SDValue NewIndex = Index.getOperand(0);
51694 SDValue NewScale =
51695 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
51696 if (N->getOpcode() == X86ISD::MGATHER)
51697 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
51698 MemOp->getOperand(1), Mask,
51699 MemOp->getBasePtr(), NewIndex, NewScale,
51700 MemOp->getChain(), Subtarget);
51701 if (N->getOpcode() == X86ISD::MSCATTER)
51702 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
51703 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
51704 NewIndex, NewScale, MemOp->getChain(), Subtarget);
51705 }
51706 }
51707
51708 // With vector masks we only demand the upper bit of the mask.
51709 if (Mask.getScalarValueSizeInBits() != 1) {
51710 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51711 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51712 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51713 if (N->getOpcode() != ISD::DELETED_NODE)
51714 DCI.AddToWorklist(N);
51715 return SDValue(N, 0);
51716 }
51717 }
51718
51719 return SDValue();
51720}
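
The scale fold in combineX86GatherScatter is ordinary address arithmetic: once the index is pointer-width (so no implicit sign extension sits between the shift and the scale), base + (index << S) * Scale addresses the same byte as base + index * (Scale << S). A minimal standalone check with illustrative values, keeping the folded scale within the 1/2/4/8 range x86 addressing allows.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Base = 0x1000;
  for (uint64_t Index : {0ull, 1ull, 7ull, 123ull})
    for (unsigned S = 0; S <= 2; ++S)
      for (uint64_t Scale : {1ull, 2ull}) {
        uint64_t NewScale = Scale << S;
        if (NewScale > 8) continue; // x86 only encodes scale 1, 2, 4, 8
        assert(Base + (Index << S) * Scale == Base + Index * NewScale);
      }
  return 0;
}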
51721
51722static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
51723 SDValue Index, SDValue Base, SDValue Scale,
51724 SelectionDAG &DAG) {
51725 SDLoc DL(GorS);
51726
51727 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
51728 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
51729 Gather->getMask(), Base, Index, Scale } ;
51730 return DAG.getMaskedGather(Gather->getVTList(),
51731 Gather->getMemoryVT(), DL, Ops,
51732 Gather->getMemOperand(),
51733 Gather->getIndexType(),
51734 Gather->getExtensionType());
51735 }
51736 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
51737 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
51738 Scatter->getMask(), Base, Index, Scale };
51739 return DAG.getMaskedScatter(Scatter->getVTList(),
51740 Scatter->getMemoryVT(), DL,
51741 Ops, Scatter->getMemOperand(),
51742 Scatter->getIndexType(),
51743 Scatter->isTruncatingStore());
51744}
51745
51746static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
51747 TargetLowering::DAGCombinerInfo &DCI) {
51748 SDLoc DL(N);
51749 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
51750 SDValue Index = GorS->getIndex();
51751 SDValue Base = GorS->getBasePtr();
51752 SDValue Scale = GorS->getScale();
51753
51754 if (DCI.isBeforeLegalize()) {
51755 unsigned IndexWidth = Index.getScalarValueSizeInBits();
51756
51757 // Shrink constant indices if they are larger than 32-bits.
51758 // Only do this before legalize types since v2i64 could become v2i32.
51759 // FIXME: We could check that the type is legal if we're after legalize
51760 // types, but then we would need to construct test cases where that happens.
51761 // FIXME: We could support more than just constant vectors, but we need to be
51762 // careful with costing. A truncate that can be optimized out would be fine.
51763 // Otherwise we might only want to create a truncate if it avoids a split.
51764 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
51765 if (BV->isConstant() && IndexWidth > 32 &&
51766 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51767 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51768 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51769 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51770 }
51771 }
51772
51773 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
51774 // there are sufficient sign bits. Only do this before legalize types to
51775 // avoid creating illegal types in truncate.
51776 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
51777 Index.getOpcode() == ISD::ZERO_EXTEND) &&
51778 IndexWidth > 32 &&
51779 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
51780 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51781 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51782 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51783 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51784 }
51785 }
51786
51787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51788 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51789 // Try to move splat constant adders from the index operand to the base
51790 // pointer operand, taking care to multiply by the scale. We can only do
51791 // this when the index element type is the same as the pointer type.
51792 // Otherwise we need to be sure the math doesn't wrap before the scale.
51793 if (Index.getOpcode() == ISD::ADD &&
51794 Index.getValueType().getVectorElementType() == PtrVT &&
51795 isa<ConstantSDNode>(Scale)) {
51796 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51797 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
51798 BitVector UndefElts;
51799 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
51800 // FIXME: Allow non-constant?
51801 if (UndefElts.none()) {
51802 // Apply the scale.
51803 APInt Adder = C->getAPIntValue() * ScaleAmt;
51804 // Add it to the existing base.
51805 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
51806 DAG.getConstant(Adder, DL, PtrVT));
51807 Index = Index.getOperand(0);
51808 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51809 }
51810 }
51811
51812 // It's also possible base is just a constant. In that case, just
51813 // replace it with 0 and move the displacement into the index.
51814 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
51815 isOneConstant(Scale)) {
51816 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
51817 // Combine the constant build_vector and the constant base.
51818 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51819 Index.getOperand(1), Splat);
51820 // Add to the LHS of the original Index add.
51821 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51822 Index.getOperand(0), Splat);
51823 Base = DAG.getConstant(0, DL, Base.getValueType());
51824 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51825 }
51826 }
51827 }
51828
51829 if (DCI.isBeforeLegalizeOps()) {
51830 unsigned IndexWidth = Index.getScalarValueSizeInBits();
51831
51832 // Make sure the index is either i32 or i64
51833 if (IndexWidth != 32 && IndexWidth != 64) {
51834 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
51835 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
51836 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
51837 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51838 }
51839 }
51840
51841 // With vector masks we only demand the upper bit of the mask.
51842 SDValue Mask = GorS->getMask();
51843 if (Mask.getScalarValueSizeInBits() != 1) {
51844 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51845 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51846 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51847 if (N->getOpcode() != ISD::DELETED_NODE)
51848 DCI.AddToWorklist(N);
51849 return SDValue(N, 0);
51850 }
51851 }
51852
51853 return SDValue();
51854}
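
The splat-adder rewrite in combineGatherScatter is again just the distributive law over pointer-width arithmetic: base + (index + C) * scale equals (base + C * scale) + index * scale for every lane. A short standalone check with made-up values.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Base = 0x4000, C = 16; // C is the splat constant adder
  for (uint64_t Scale : {1ull, 2ull, 4ull, 8ull})
    for (uint64_t Index : {0ull, 3ull, 100ull}) {
      uint64_t Original = Base + (Index + C) * Scale;
      uint64_t Rewritten = (Base + C * Scale) + Index * Scale;
      assert(Original == Rewritten);
    }
  return 0;
}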
51855
51856// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
51857static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
51858 const X86Subtarget &Subtarget) {
51859 SDLoc DL(N);
51860 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
51861 SDValue EFLAGS = N->getOperand(1);
51862
51863 // Try to simplify the EFLAGS and condition code operands.
51864 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
51865 return getSETCC(CC, Flags, DL, DAG);
51866
51867 return SDValue();
51868}
51869
51870/// Optimize branch condition evaluation.
51871static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
51872 const X86Subtarget &Subtarget) {
51873 SDLoc DL(N);
51874 SDValue EFLAGS = N->getOperand(3);
51875 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
51876
51877 // Try to simplify the EFLAGS and condition code operands.
51878 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
51879 // RAUW them under us.
51880 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
51881 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
51882 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
51883 N->getOperand(1), Cond, Flags);
51884 }
51885
51886 return SDValue();
51887}
51888
51889// TODO: Could we move this to DAGCombine?
51890static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
51891 SelectionDAG &DAG) {
51892 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
51893 // to optimize away operation when it's from a constant.
51894 //
51895 // The general transformation is:
51896 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
51897 // AND(VECTOR_CMP(x,y), constant2)
51898 // constant2 = UNARYOP(constant)
51899
51900 // Early exit if this isn't a vector operation, the operand of the
51901 // unary operation isn't a bitwise AND, or if the sizes of the operations
51902 // aren't the same.
51903 EVT VT = N->getValueType(0);
51904 bool IsStrict = N->isStrictFPOpcode();
51905 unsigned NumEltBits = VT.getScalarSizeInBits();
51906 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51907 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
51908 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
51909 VT.getSizeInBits() != Op0.getValueSizeInBits())
51910 return SDValue();
51911
51912 // Now check that the other operand of the AND is a constant. We could
51913 // make the transformation for non-constant splats as well, but it's unclear
51914 // that would be a benefit as it would not eliminate any operations, just
51915 // perform one more step in scalar code before moving to the vector unit.
51916 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
51917 // Bail out if the vector isn't a constant.
51918 if (!BV->isConstant())
51919 return SDValue();
51920
51921 // Everything checks out. Build up the new and improved node.
51922 SDLoc DL(N);
51923 EVT IntVT = BV->getValueType(0);
51924 // Create a new constant of the appropriate type for the transformed
51925 // DAG.
51926 SDValue SourceConst;
51927 if (IsStrict)
51928 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
51929 {N->getOperand(0), SDValue(BV, 0)});
51930 else
51931 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
51932 // The AND node needs bitcasts to/from an integer vector type around it.
51933 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
51934 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
51935 MaskConst);
51936 SDValue Res = DAG.getBitcast(VT, NewAnd);
51937 if (IsStrict)
51938 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
51939 return Res;
51940 }
51941
51942 return SDValue();
51943}
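
combineVectorCompareAndMaskUnaryOp above hinges on each compare lane being all-zeros or all-ones: the unary op (here int-to-fp) applied to and(mask, C) produces the same bit pattern as and(mask, bitcast(unaryop(C))), because the all-zero pattern is +0.0. A single-lane standalone sketch; bitsOfDouble is an invented helper and the constant is arbitrary.

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret a double's bit pattern as a 64-bit integer.
static uint64_t bitsOfDouble(double D) {
  uint64_t Bits;
  memcpy(&Bits, &D, sizeof(Bits));
  return Bits;
}

int main() {
  const int64_t C = 12345;
  for (uint64_t Mask : {0ull, ~0ull}) { // the only values a compare lane takes
    // sint_to_fp(and(mask, C)) ...
    uint64_t Lhs = bitsOfDouble((double)(int64_t)(Mask & (uint64_t)C));
    // ... has the same bits as and(mask, bitcast(sint_to_fp(C))).
    uint64_t Rhs = Mask & bitsOfDouble((double)C);
    assert(Lhs == Rhs);
  }
  return 0;
}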
51944
51945/// If we are converting a value to floating-point, try to replace scalar
51946/// truncate of an extracted vector element with a bitcast. This tries to keep
51947/// the sequence on XMM registers rather than moving between vector and GPRs.
51948static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
51949 // TODO: This is currently only used by combineSIntToFP, but it is generalized
51950 // to allow being called by any similar cast opcode.
51951 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
51952 SDValue Trunc = N->getOperand(0);
51953 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
51954 return SDValue();
51955
51956 SDValue ExtElt = Trunc.getOperand(0);
51957 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51958 !isNullConstant(ExtElt.getOperand(1)))
51959 return SDValue();
51960
51961 EVT TruncVT = Trunc.getValueType();
51962 EVT SrcVT = ExtElt.getValueType();
51963 unsigned DestWidth = TruncVT.getSizeInBits();
51964 unsigned SrcWidth = SrcVT.getSizeInBits();
51965 if (SrcWidth % DestWidth != 0)
51966 return SDValue();
51967
51968 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
51969 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
51970 unsigned VecWidth = SrcVecVT.getSizeInBits();
51971 unsigned NumElts = VecWidth / DestWidth;
51972 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
51973 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
51974 SDLoc DL(N);
51975 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
51976 BitcastVec, ExtElt.getOperand(1));
51977 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
51978}
51979
51980static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
51981 const X86Subtarget &Subtarget) {
51982 bool IsStrict = N->isStrictFPOpcode();
51983 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51984 EVT VT = N->getValueType(0);
51985 EVT InVT = Op0.getValueType();
51986
51987 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
51988 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
51989 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
51990 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
51991 unsigned ScalarSize = InVT.getScalarSizeInBits();
51992 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
51993 return SDValue();
51994 SDLoc dl(N);
51995 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
51996 ScalarSize < 16 ? MVT::i16
51997 : ScalarSize < 32 ? MVT::i32
51998 : MVT::i64,
51999 InVT.getVectorNumElements());
52000 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
52001 if (IsStrict)
52002 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
52003 {N->getOperand(0), P});
52004 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
52005 }
52006
52007 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
52008 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
52009 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
52010 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
52011 VT.getScalarType() != MVT::f16) {
52012 SDLoc dl(N);
52013 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
52014 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
52015
52016 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
52017 if (IsStrict)
52018 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52019 {N->getOperand(0), P});
52020 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52021 }
52022
52023 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
52024 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
52025 // the optimization here.
52026 if (DAG.SignBitIsZero(Op0)) {
52027 if (IsStrict)
52028 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
52029 {N->getOperand(0), Op0});
52030 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
52031 }
52032
52033 return SDValue();
52034}
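
The final rewrite in combineUIntToFP uses the fact that when the sign bit of the input is known zero, the unsigned and signed conversions agree, so the cheaper SINT_TO_FP can be used. A minimal standalone check over a few sign-bit-clear values chosen for illustration.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 12345u, 0x7fffffffu}) { // sign bit clear
    double FromUnsigned = (double)X;
    double FromSigned = (double)(int32_t)X;
    assert(FromUnsigned == FromSigned);
  }
  return 0;
}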
52035
52036static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
52037 TargetLowering::DAGCombinerInfo &DCI,
52038 const X86Subtarget &Subtarget) {
52039 // First try to optimize away the conversion entirely when it's
52040 // conditionally from a constant. Vectors only.
52041 bool IsStrict = N->isStrictFPOpcode();
52042 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
52043 return Res;
52044
52045 // Now move on to more general possibilities.
52046 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
52047 EVT VT = N->getValueType(0);
52048 EVT InVT = Op0.getValueType();
52049
52050 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
52051 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
52052 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
52053 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
52054 unsigned ScalarSize = InVT.getScalarSizeInBits();
52055 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
52056 return SDValue();
52057 SDLoc dl(N);
52058 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
52059 ScalarSize < 16 ? MVT::i16
52060 : ScalarSize < 32 ? MVT::i32
52061 : MVT::i64,
52062 InVT.getVectorNumElements());
52063 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
52064 if (IsStrict)
52065 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52066 {N->getOperand(0), P});
52067 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52068 }
52069
52070 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
52071 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
52072 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
52073 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
52074 VT.getScalarType() != MVT::f16) {
52075 SDLoc dl(N);
52076 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
52077 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
52078 if (IsStrict)
52079 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52080 {N->getOperand(0), P});
52081 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
52082 }
52083
52084 // Without AVX512DQ we only support i64 to float scalar conversion. For both
52085 // vectors and scalars, see if we know that the upper bits are all the sign
52086 // bit, in which case we can truncate the input to i32 and convert from that.
52087 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
52088 unsigned BitWidth = InVT.getScalarSizeInBits();
52089 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
52090 if (NumSignBits >= (BitWidth - 31)) {
52091 EVT TruncVT = MVT::i32;
52092 if (InVT.isVector())
52093 TruncVT = InVT.changeVectorElementType(TruncVT);
52094 SDLoc dl(N);
52095 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
52096 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
52097 if (IsStrict)
52098 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
52099 {N->getOperand(0), Trunc});
52100 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
52101 }
52102 // If we're after legalize and the type is v2i32 we need to shuffle and
52103 // use CVTSI2P.
52104      assert(InVT == MVT::v2i64 && "Unexpected VT!");
52105 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
52106 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
52107 { 0, 2, -1, -1 });
52108 if (IsStrict)
52109 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
52110 {N->getOperand(0), Shuf});
52111 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
52112 }
52113 }
52114
52115 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
52116 // a 32-bit target where SSE doesn't support i64->FP operations.
52117 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
52118 Op0.getOpcode() == ISD::LOAD) {
52119 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
52120
52121 // This transformation is not supported if the result type is f16 or f128.
52122 if (VT == MVT::f16 || VT == MVT::f128)
52123 return SDValue();
52124
52125 // If we have AVX512DQ we can use packed conversion instructions unless
52126 // the VT is f80.
52127 if (Subtarget.hasDQI() && VT != MVT::f80)
52128 return SDValue();
52129
52130 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
52131 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
52132 std::pair<SDValue, SDValue> Tmp =
52133 Subtarget.getTargetLowering()->BuildFILD(
52134 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
52135 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
52136 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
52137 return Tmp.first;
52138 }
52139 }
52140
52141 if (IsStrict)
52142 return SDValue();
52143
52144 if (SDValue V = combineToFPTruncExtElt(N, DAG))
52145 return V;
52146
52147 return SDValue();
52148}
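
The i64-truncation path in combineSIntToFP is valid because an i64 whose upper 33 bits are copies of the sign bit holds a value that fits in i32, and converting the truncated i32 yields the same floating-point result. A short standalone check over illustrative in-range values.

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t X : {(int64_t)0, (int64_t)-1, (int64_t)INT32_MIN,
                    (int64_t)INT32_MAX, (int64_t)-123456}) {
    assert(X >= INT32_MIN && X <= INT32_MAX); // i.e. at least 33 sign bits
    assert((double)X == (double)(int32_t)X);  // truncate-then-convert matches
  }
  return 0;
}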
52149
52150static bool needCarryOrOverflowFlag(SDValue Flags) {
52151 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
52152
52153 for (const SDNode *User : Flags->uses()) {
52154 X86::CondCode CC;
52155 switch (User->getOpcode()) {
52156 default:
52157 // Be conservative.
52158 return true;
52159 case X86ISD::SETCC:
52160 case X86ISD::SETCC_CARRY:
52161 CC = (X86::CondCode)User->getConstantOperandVal(0);
52162 break;
52163 case X86ISD::BRCOND:
52164 CC = (X86::CondCode)User->getConstantOperandVal(2);
52165 break;
52166 case X86ISD::CMOV:
52167 CC = (X86::CondCode)User->getConstantOperandVal(2);
52168 break;
52169 }
52170
52171 switch (CC) {
52172 default: break;
52173 case X86::COND_A: case X86::COND_AE:
52174 case X86::COND_B: case X86::COND_BE:
52175 case X86::COND_O: case X86::COND_NO:
52176 case X86::COND_G: case X86::COND_GE:
52177 case X86::COND_L: case X86::COND_LE:
52178 return true;
52179 }
52180 }
52181
52182 return false;
52183}
52184
52185static bool onlyZeroFlagUsed(SDValue Flags) {
52186 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
52187
52188 for (const SDNode *User : Flags->uses()) {
52189 unsigned CCOpNo;
52190 switch (User->getOpcode()) {
52191 default:
52192 // Be conservative.
52193 return false;
52194 case X86ISD::SETCC: CCOpNo = 0; break;
52195 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
52196 case X86ISD::BRCOND: CCOpNo = 2; break;
52197 case X86ISD::CMOV: CCOpNo = 2; break;
52198 }
52199
52200 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
52201 if (CC != X86::COND_E && CC != X86::COND_NE)
52202 return false;
52203 }
52204
52205 return true;
52206}
52207
52208static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
52209 // Only handle test patterns.
52210 if (!isNullConstant(N->getOperand(1)))
52211 return SDValue();
52212
52213 // If we have a CMP of a truncated binop, see if we can make a smaller binop
52214 // and use its flags directly.
52215 // TODO: Maybe we should try promoting compares that only use the zero flag
52216 // first if we can prove the upper bits with computeKnownBits?
52217 SDLoc dl(N);
52218 SDValue Op = N->getOperand(0);
52219 EVT VT = Op.getValueType();
52220
52221 // If we have a constant logical shift that's only used in a comparison
52222 // against zero, turn it into an equivalent AND. This allows turning it into
52223 // a TEST instruction later.
52224 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
52225 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
52226 onlyZeroFlagUsed(SDValue(N, 0))) {
52227 unsigned BitWidth = VT.getSizeInBits();
52228 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
52229 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
52230 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
52231 APInt Mask = Op.getOpcode() == ISD::SRL
52232 ? APInt::getHighBitsSet(BitWidth, MaskBits)
52233 : APInt::getLowBitsSet(BitWidth, MaskBits);
52234 if (Mask.isSignedIntN(32)) {
52235 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
52236 DAG.getConstant(Mask, dl, VT));
52237 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52238 DAG.getConstant(0, dl, VT));
52239 }
52240 }
52241 }
52242
52243 // Peek through any zero-extend if we're only testing for a zero result.
52244 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
52245 SDValue Src = Op.getOperand(0);
52246 EVT SrcVT = Src.getValueType();
52247 if (SrcVT.getScalarSizeInBits() >= 8 &&
52248 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
52249 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
52250 DAG.getConstant(0, dl, SrcVT));
52251 }
52252
52253 // Look for a truncate.
52254 if (Op.getOpcode() != ISD::TRUNCATE)
52255 return SDValue();
52256
52257 SDValue Trunc = Op;
52258 Op = Op.getOperand(0);
52259
52260 // See if we can compare with zero against the truncation source,
52261 // which should help using the Z flag from many ops. Only do this for
52262 // an i32 truncated op to prevent partial-reg compares of promoted ops.
52263 EVT OpVT = Op.getValueType();
52264 APInt UpperBits =
52265 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
52266 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
52267 onlyZeroFlagUsed(SDValue(N, 0))) {
52268 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52269 DAG.getConstant(0, dl, OpVT));
52270 }
52271
52272 // After this the truncate and arithmetic op must have a single use.
52273 if (!Trunc.hasOneUse() || !Op.hasOneUse())
52274 return SDValue();
52275
52276 unsigned NewOpc;
52277 switch (Op.getOpcode()) {
52278 default: return SDValue();
52279 case ISD::AND:
52280 // Skip AND with a constant. We have special handling for AND with an
52281 // immediate during isel to generate TEST instructions.
52282 if (isa<ConstantSDNode>(Op.getOperand(1)))
52283 return SDValue();
52284 NewOpc = X86ISD::AND;
52285 break;
52286 case ISD::OR: NewOpc = X86ISD::OR; break;
52287 case ISD::XOR: NewOpc = X86ISD::XOR; break;
52288 case ISD::ADD:
52289 // If the carry or overflow flag is used, we can't truncate.
52290 if (needCarryOrOverflowFlag(SDValue(N, 0)))
52291 return SDValue();
52292 NewOpc = X86ISD::ADD;
52293 break;
52294 case ISD::SUB:
52295 // If the carry or overflow flag is used, we can't truncate.
52296 if (needCarryOrOverflowFlag(SDValue(N, 0)))
52297 return SDValue();
52298 NewOpc = X86ISD::SUB;
52299 break;
52300 }
52301
52302 // We found an op we can narrow. Truncate its inputs.
52303 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
52304 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
52305
52306 // Use an X86-specific opcode to avoid DAG combine messing with it.
52307 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52308 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
52309
52310 // For AND, keep a CMP so that we can match the test pattern.
52311 if (NewOpc == X86ISD::AND)
52312 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
52313 DAG.getConstant(0, dl, VT));
52314
52315 // Return the flags.
52316 return Op.getValue(1);
52317}
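
The shift handling at the top of combineCMP depends on the identity that comparing a constant logical shift against zero is the same as testing the bits that survive the shift with a mask, which is what lets it become a TEST later. Below is a minimal standalone C++ sketch, not part of X86ISelLowering.cpp, that checks this identity with plain uint32_t arithmetic in place of APInt; everything in it is illustrative only.

// Identity used by combineCMP's shift handling, checked with uint32_t:
//   (X >> C) == 0  <=>  (X & getHighBitsSet(BitWidth, BitWidth - C)) == 0
//   (X << C) == 0  <=>  (X & getLowBitsSet(BitWidth, BitWidth - C))  == 0
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const unsigned BitWidth = 32;
  for (unsigned C = 1; C < BitWidth; ++C) {
    unsigned MaskBits = BitWidth - C;
    uint32_t HighMask = ~0u << C;            // high MaskBits bits: the bits that survive SRL by C
    uint32_t LowMask = (1u << MaskBits) - 1; // low MaskBits bits: the bits that survive SHL by C
    for (uint32_t X : {0u, 1u, 0x80000000u, 0xdeadbeefu, ~0u}) {
      assert(((X >> C) == 0) == ((X & HighMask) == 0));
      assert(((X << C) == 0) == ((X & LowMask) == 0));
    }
  }
  return 0;
}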
52318
52319static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
52320 TargetLowering::DAGCombinerInfo &DCI) {
52321 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
52322 "Expected X86ISD::ADD or X86ISD::SUB");
52323
52324 SDLoc DL(N);
52325 SDValue LHS = N->getOperand(0);
52326 SDValue RHS = N->getOperand(1);
52327 MVT VT = LHS.getSimpleValueType();
52328 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
52329
52330 // If we don't use the flag result, simplify back to a generic ADD/SUB.
52331 if (!N->hasAnyUseOfValue(1)) {
52332 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
52333 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
52334 }
52335
52336 // Fold any similar generic ADD/SUB opcodes to reuse this node.
52337 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
52338 SDValue Ops[] = {N0, N1};
52339 SDVTList VTs = DAG.getVTList(N->getValueType(0));
52340 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
52341 SDValue Op(N, 0);
52342 if (Negate)
52343 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
52344 DCI.CombineTo(GenericAddSub, Op);
52345 }
52346 };
52347 MatchGeneric(LHS, RHS, false);
52348 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
52349
52350 return SDValue();
52351}
52352
52353static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
52354 SDValue LHS = N->getOperand(0);
52355 SDValue RHS = N->getOperand(1);
52356 SDValue BorrowIn = N->getOperand(2);
52357
52358 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
52359 MVT VT = N->getSimpleValueType(0);
52360 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52361 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
52362 }
52363
52364 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
52365 // iff the flag result is dead.
52366 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
52367 !N->hasAnyUseOfValue(1))
52368 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
52369 LHS.getOperand(1), BorrowIn);
52370
52371 return SDValue();
52372}
52373
52374// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
52375static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
52376 TargetLowering::DAGCombinerInfo &DCI) {
52377 SDValue LHS = N->getOperand(0);
52378 SDValue RHS = N->getOperand(1);
52379 SDValue CarryIn = N->getOperand(2);
52380 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
52381 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
52382
52383 // Canonicalize constant to RHS.
52384 if (LHSC && !RHSC)
52385 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
52386 CarryIn);
52387
52388 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
52389 // the result is either zero or one (depending on the input carry bit).
52390 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
52391 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
52392 // We don't have a good way to replace an EFLAGS use, so only do this when
52393 // dead right now.
52394 SDValue(N, 1).use_empty()) {
52395 SDLoc DL(N);
52396 EVT VT = N->getValueType(0);
52397 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
52398 SDValue Res1 = DAG.getNode(
52399 ISD::AND, DL, VT,
52400 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52401 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
52402 DAG.getConstant(1, DL, VT));
52403 return DCI.CombineTo(N, Res1, CarryOut);
52404 }
52405
52406 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
52407 // iff the flag result is dead.
52408 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
52409 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
52410 SDLoc DL(N);
52411 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
52412 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
52413 DAG.getConstant(0, DL, LHS.getValueType()),
52414 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
52415 }
52416
52417 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
52418 MVT VT = N->getSimpleValueType(0);
52419 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52420 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
52421 }
52422
52423 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
52424 // iff the flag result is dead.
52425 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
52426 !N->hasAnyUseOfValue(1))
52427 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
52428 LHS.getOperand(1), CarryIn);
52429
52430 return SDValue();
52431}
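
A note on the ADC folds above: the value result of X86ISD::ADC is LHS + RHS + CF, so folding ADC(C1,C2,Carry) into ADC(0,C1+C2,Carry) is value-preserving whenever the flag result is dead, which is exactly the condition the combine checks. The following standalone sketch (illustrative only, not LLVM code) models that semantics and spot-checks the identity.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Value result of x86 ADC: LHS + RHS + CF, wrapping modulo 2^64.
static uint64_t adcValue(uint64_t LHS, uint64_t RHS, bool CarryIn) {
  return LHS + RHS + (CarryIn ? 1 : 0);
}

int main() {
  const uint64_t Cases[][2] = {
      {1, 2}, {0xffffffffffffffffULL, 1}, {123, 0}, {~0ULL, ~0ULL}};
  for (const auto &C : Cases)
    for (bool CF : {false, true})
      // ADC(C1, C2, CF) and ADC(0, C1 + C2, CF) agree on the value result,
      // which is all that matters once the flag result is known to be dead.
      assert(adcValue(C[0], C[1], CF) == adcValue(0, C[0] + C[1], CF));
  return 0;
}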
52432
52433/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52434/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52435/// with CMP+{ADC, SBB}.
52436/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52437static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52438 SDValue X, SDValue Y,
52439 SelectionDAG &DAG) {
52440 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52441 return SDValue();
52442
52443 // Look through a one-use zext.
52444 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52445 Y = Y.getOperand(0);
52446
52447 X86::CondCode CC;
52448 SDValue EFLAGS;
52449 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52450 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52451 EFLAGS = Y.getOperand(1);
52452 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52453 Y.hasOneUse()) {
52454 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52455 }
52456
52457 if (!EFLAGS)
52458 return SDValue();
52459
52460 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52461 // the general case below.
52462 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52463 if (ConstantX) {
52464 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52465 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52466 // This is a complicated way to get -1 or 0 from the carry flag:
52467 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52468 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52469 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52470 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52471 EFLAGS);
52472 }
52473
52474 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52475 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52476 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52477 EFLAGS.getValueType().isInteger() &&
52478 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52479 // Swap the operands of a SUB, and we have the same pattern as above.
52480 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52481 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52482 SDValue NewSub = DAG.getNode(
52483 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52484 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52485 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52486 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52487 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52488 NewEFLAGS);
52489 }
52490 }
52491 }
52492
52493 if (CC == X86::COND_B) {
52494 // X + SETB Z --> adc X, 0
52495 // X - SETB Z --> sbb X, 0
52496 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52497 DAG.getVTList(VT, MVT::i32), X,
52498 DAG.getConstant(0, DL, VT), EFLAGS);
52499 }
52500
52501 if (CC == X86::COND_A) {
52502 // Try to convert COND_A into COND_B in an attempt to facilitate
52503 // materializing "setb reg".
52504 //
52505 // Do not flip "e > c", where "c" is a constant, because the CMP instruction
52506 // cannot take an immediate as its first operand.
52507 //
52508 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52509 EFLAGS.getValueType().isInteger() &&
52510 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52511 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
52512 EFLAGS.getNode()->getVTList(),
52513 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52514 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52515 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52516 DAG.getVTList(VT, MVT::i32), X,
52517 DAG.getConstant(0, DL, VT), NewEFLAGS);
52518 }
52519 }
52520
52521 if (CC == X86::COND_AE) {
52522 // X + SETAE --> sbb X, -1
52523 // X - SETAE --> adc X, -1
52524 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52525 DAG.getVTList(VT, MVT::i32), X,
52526 DAG.getConstant(-1, DL, VT), EFLAGS);
52527 }
52528
52529 if (CC == X86::COND_BE) {
52530 // X + SETBE --> sbb X, -1
52531 // X - SETBE --> adc X, -1
52532 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52533 // materializing "setae reg".
52534 //
52535 // Do not flip "e <= c", where "c" is a constant, because the CMP instruction
52536 // cannot take an immediate as its first operand.
52537 //
52538 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52539 EFLAGS.getValueType().isInteger() &&
52540 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52541 SDValue NewSub = DAG.getNode(
52542 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52543 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52544 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52545 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52546 DAG.getVTList(VT, MVT::i32), X,
52547 DAG.getConstant(-1, DL, VT), NewEFLAGS);
52548 }
52549 }
52550
52551 if (CC != X86::COND_E && CC != X86::COND_NE)
52552 return SDValue();
52553
52554 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52555 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52556 !EFLAGS.getOperand(0).getValueType().isInteger())
52557 return SDValue();
52558
52559 SDValue Z = EFLAGS.getOperand(0);
52560 EVT ZVT = Z.getValueType();
52561
52562 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52563 // the general case below.
52564 if (ConstantX) {
52565 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52566 // fake operands:
52567 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52568 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52569 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52570 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52571 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52572 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52573 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52574 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52575 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52576 SDValue(Neg.getNode(), 1));
52577 }
52578
52579 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52580 // with fake operands:
52581 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52582 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52583 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52584 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52585 SDValue One = DAG.getConstant(1, DL, ZVT);
52586 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52587 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52588 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52589 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52590 Cmp1.getValue(1));
52591 }
52592 }
52593
52594 // (cmp Z, 1) sets the carry flag if Z is 0.
52595 SDValue One = DAG.getConstant(1, DL, ZVT);
52596 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52597 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52598
52599 // Add the flags type for ADC/SBB nodes.
52600 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52601
52602 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52603 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52604 if (CC == X86::COND_NE)
52605 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52606 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
52607
52608 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52609 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52610 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52611 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52612}
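
The tail of this helper leans on the fact that cmp Z, 1 (emitted here as X86ISD::SUB Z, 1) sets the carry flag exactly when Z == 0, so adding or subtracting a setcc of Z against zero can be folded into adc/sbb with a 0 or -1 immediate. The sketch below (illustrative only, not LLVM code) models that carry flag and spot-checks the four identities quoted in the comments.

// Model: for unsigned Z, "cmp Z, 1" sets CF iff Z < 1, i.e. iff Z == 0.
// Values wrap modulo 2^64, matching the DAG nodes.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint64_t adc(uint64_t A, uint64_t B, bool CF) { return A + B + (CF ? 1 : 0); }
static uint64_t sbb(uint64_t A, uint64_t B, bool CF) { return A - B - (CF ? 1 : 0); }

int main() {
  for (uint64_t Z : {0ULL, 1ULL, 42ULL, ~0ULL}) {
    bool CF = Z < 1; // carry out of "cmp Z, 1"
    for (uint64_t X : {0ULL, 7ULL, ~0ULL}) {
      assert(X + (Z != 0 ? 1 : 0) == sbb(X, (uint64_t)-1, CF)); // X + (Z != 0) --> sbb X, -1
      assert(X + (Z == 0 ? 1 : 0) == adc(X, 0, CF));            // X + (Z == 0) --> adc X, 0
      assert(X - (Z != 0 ? 1 : 0) == adc(X, (uint64_t)-1, CF)); // X - (Z != 0) --> adc X, -1
      assert(X - (Z == 0 ? 1 : 0) == sbb(X, 0, CF));            // X - (Z == 0) --> sbb X, 0
    }
  }
  return 0;
}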
52613
52614/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52615/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52616/// with CMP+{ADC, SBB}.
52617static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
52618 bool IsSub = N->getOpcode() == ISD::SUB;
52619 SDValue X = N->getOperand(0);
52620 SDValue Y = N->getOperand(1);
52621 EVT VT = N->getValueType(0);
52622 SDLoc DL(N);
52623
52624 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52625 return ADCOrSBB;
52626
52627 // Commute and try again (negate the result for subtracts).
52628 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52629 if (IsSub)
52630 ADCOrSBB =
52631 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
52632 return ADCOrSBB;
52633 }
52634
52635 return SDValue();
52636}
52637
52638static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
52639 const SDLoc &DL, EVT VT,
52640 const X86Subtarget &Subtarget) {
52641 // Example of pattern we try to detect:
52642 // t := (v8i32 mul (sext (v8i16 x0)), (sext (v8i16 x1)))
52643 // (add (build_vector (extract_elt t, 0),
52644 // (extract_elt t, 2),
52645 // (extract_elt t, 4),
52646 // (extract_elt t, 6)),
52647 // (build_vector (extract_elt t, 1),
52648 // (extract_elt t, 3),
52649 // (extract_elt t, 5),
52650 // (extract_elt t, 7)))
52651
52652 if (!Subtarget.hasSSE2())
52653 return SDValue();
52654
52655 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
52656 Op1.getOpcode() != ISD::BUILD_VECTOR)
52657 return SDValue();
52658
52659 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52660 VT.getVectorNumElements() < 4 ||
52661 !isPowerOf2_32(VT.getVectorNumElements()))
52662 return SDValue();
52663
52664 // Check if one of Op0,Op1 is of the form:
52665 // (build_vector (extract_elt Mul, 0),
52666 // (extract_elt Mul, 2),
52667 // (extract_elt Mul, 4),
52668 // ...
52669 // the other is of the form:
52670 // (build_vector (extract_elt Mul, 1),
52671 // (extract_elt Mul, 3),
52672 // (extract_elt Mul, 5),
52673 // ...
52674 // and identify Mul.
52675 SDValue Mul;
52676 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
52677 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
52678 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
52679 // TODO: Be more tolerant to undefs.
52680 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52681 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52682 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52683 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52684 return SDValue();
52685 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
52686 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
52687 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
52688 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
52689 if (!Const0L || !Const1L || !Const0H || !Const1H)
52690 return SDValue();
52691 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
52692 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
52693 // Commutativity of mul allows factors of a product to reorder.
52694 if (Idx0L > Idx1L)
52695 std::swap(Idx0L, Idx1L);
52696 if (Idx0H > Idx1H)
52697 std::swap(Idx0H, Idx1H);
52698 // Commutativity of add allows pairs of factors to reorder.
52699 if (Idx0L > Idx0H) {
52700 std::swap(Idx0L, Idx0H);
52701 std::swap(Idx1L, Idx1H);
52702 }
52703 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
52704 Idx1H != 2 * i + 3)
52705 return SDValue();
52706 if (!Mul) {
52707 // First time an extract_elt's source vector is visited. It must be a MUL
52708 // with 2X the number of vector elements of the BUILD_VECTOR.
52709 // Both extracts must be from the same MUL.
52710 Mul = Op0L->getOperand(0);
52711 if (Mul->getOpcode() != ISD::MUL ||
52712 Mul.getValueType().getVectorNumElements() != 2 * e)
52713 return SDValue();
52714 }
52715 // Check that the extract is from the same MUL previously seen.
52716 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
52717 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
52718 return SDValue();
52719 }
52720
52721 // Check if the Mul source can be safely shrunk.
52722 ShrinkMode Mode;
52723 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
52724 Mode == ShrinkMode::MULU16)
52725 return SDValue();
52726
52727 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52728 VT.getVectorNumElements() * 2);
52729 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
52730 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
52731
52732 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52733 ArrayRef<SDValue> Ops) {
52734 EVT InVT = Ops[0].getValueType();
52735 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52736 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52737 InVT.getVectorNumElements() / 2);
52738 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52739 };
52740 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
52741}
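
For context, the instruction matchPMADDWD is building towards, X86ISD::VPMADDWD (pmaddwd), multiplies adjacent signed 16-bit pairs and sums each pair into a 32-bit lane; the even/odd extract-and-add pattern above reconstructs exactly that. A scalar reference model follows (illustrative only; the pmaddwd helper below is a stand-in, not an intrinsic or LLVM API).

// Scalar reference model of pmaddwd: each 32-bit result lane is
//   (int32)A[2i] * B[2i] + (int32)A[2i+1] * B[2i+1]
// with A and B holding signed 16-bit elements.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int32_t> pmaddwd(const std::vector<int16_t> &A,
                                    const std::vector<int16_t> &B) {
  assert(A.size() == B.size() && A.size() % 2 == 0);
  std::vector<int32_t> R(A.size() / 2);
  for (unsigned i = 0; i < R.size(); ++i)
    R[i] = int32_t(A[2 * i]) * B[2 * i] + int32_t(A[2 * i + 1]) * B[2 * i + 1];
  return R;
}

int main() {
  std::vector<int16_t> A = {1, -2, 3, 4, -32768, 2, 100, -100};
  std::vector<int16_t> B = {5, 6, -7, 8, 2, 2, -1, 1};
  std::vector<int32_t> R = pmaddwd(A, B);
  assert(R[0] == 1 * 5 + (-2) * 6);      // -7
  assert(R[1] == 3 * -7 + 4 * 8);        // 11
  assert(R[2] == -32768 * 2 + 2 * 2);    // -65532
  assert(R[3] == 100 * -1 + (-100) * 1); // -200
  return 0;
}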
52742
52743// Attempt to turn this pattern into PMADDWD.
52744// (add (mul (sext (build_vector)), (sext (build_vector))),
52745// (mul (sext (build_vector)), (sext (build_vector)))
52746static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
52747 const SDLoc &DL, EVT VT,
52748 const X86Subtarget &Subtarget) {
52749 if (!Subtarget.hasSSE2())
52750 return SDValue();
52751
52752 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52753 return SDValue();
52754
52755 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52756 VT.getVectorNumElements() < 4 ||
52757 !isPowerOf2_32(VT.getVectorNumElements()))
52758 return SDValue();
52759
52760 SDValue N00 = N0.getOperand(0);
52761 SDValue N01 = N0.getOperand(1);
52762 SDValue N10 = N1.getOperand(0);
52763 SDValue N11 = N1.getOperand(1);
52764
52765 // All inputs need to be sign extends.
52766 // TODO: Support ZERO_EXTEND from known positive?
52767 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
52768 N01.getOpcode() != ISD::SIGN_EXTEND ||
52769 N10.getOpcode() != ISD::SIGN_EXTEND ||
52770 N11.getOpcode() != ISD::SIGN_EXTEND)
52771 return SDValue();
52772
52773 // Peek through the extends.
52774 N00 = N00.getOperand(0);
52775 N01 = N01.getOperand(0);
52776 N10 = N10.getOperand(0);
52777 N11 = N11.getOperand(0);
52778
52779 // Must be extending from vXi16.
52780 EVT InVT = N00.getValueType();
52781 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
52782 N10.getValueType() != InVT || N11.getValueType() != InVT)
52783 return SDValue();
52784
52785 // All inputs should be build_vectors.
52786 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52787 N01.getOpcode() != ISD::BUILD_VECTOR ||
52788 N10.getOpcode() != ISD::BUILD_VECTOR ||
52789 N11.getOpcode() != ISD::BUILD_VECTOR)
52790 return SDValue();
52791
52792 // For each element, we need to ensure we have an odd element from one vector
52793 // multiplied by the odd element of the other vector, and the even element
52794 // from one of the same vectors multiplied by the even element from the
52795 // other vector. So we need to make sure that for each element i, this
52796 // operation is being performed:
52797 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52798 SDValue In0, In1;
52799 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
52800 SDValue N00Elt = N00.getOperand(i);
52801 SDValue N01Elt = N01.getOperand(i);
52802 SDValue N10Elt = N10.getOperand(i);
52803 SDValue N11Elt = N11.getOperand(i);
52804 // TODO: Be more tolerant to undefs.
52805 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52806 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52807 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52808 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52809 return SDValue();
52810 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52811 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52812 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52813 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52814 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52815 return SDValue();
52816 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52817 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52818 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52819 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52820 // Add is commutative so indices can be reordered.
52821 if (IdxN00 > IdxN10) {
52822 std::swap(IdxN00, IdxN10);
52823 std::swap(IdxN01, IdxN11);
52824 }
52825 // N0 indices must be the even element. N1 indices must be the next odd element.
52826 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52827 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52828 return SDValue();
52829 SDValue N00In = N00Elt.getOperand(0);
52830 SDValue N01In = N01Elt.getOperand(0);
52831 SDValue N10In = N10Elt.getOperand(0);
52832 SDValue N11In = N11Elt.getOperand(0);
52833
52834 // The first time we find an input, capture it.
52835 if (!In0) {
52836 In0 = N00In;
52837 In1 = N01In;
52838
52839 // The input vectors must be at least as wide as the output.
52840 // If they are larger than the output, we extract a subvector below.
52841 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
52842 In1.getValueSizeInBits() < VT.getSizeInBits())
52843 return SDValue();
52844 }
52845 // Mul is commutative so the input vectors can be in any order.
52846 // Canonicalize to make the compares easier.
52847 if (In0 != N00In)
52848 std::swap(N00In, N01In);
52849 if (In0 != N10In)
52850 std::swap(N10In, N11In);
52851 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
52852 return SDValue();
52853 }
52854
52855 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52856 ArrayRef<SDValue> Ops) {
52857 EVT OpVT = Ops[0].getValueType();
52858 assert(OpVT.getScalarType() == MVT::i16 &&
52859 "Unexpected scalar element type");
52860 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
52861 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52862 OpVT.getVectorNumElements() / 2);
52863 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52864 };
52865
52866 // If the output is narrower than an input, extract the low part of the input
52867 // vector.
52868 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52869 VT.getVectorNumElements() * 2);
52870 if (OutVT16.bitsLT(In0.getValueType())) {
52871 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
52872 DAG.getIntPtrConstant(0, DL));
52873 }
52874 if (OutVT16.bitsLT(In1.getValueType())) {
52875 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
52876 DAG.getIntPtrConstant(0, DL));
52877 }
52878 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
52879 PMADDBuilder);
52880}
52881
52882// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
52884 // If the upper element in each pair of both VPMADDWD operands is zero, we can merge
52884// the operand elements and use the implicit add of VPMADDWD.
52885// TODO: Add support for VPMADDUBSW (which isn't commutable).
52886static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
52887 const SDLoc &DL, EVT VT) {
52888 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
52889 return SDValue();
52890
52891 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
52892 if (VT.getSizeInBits() > 128)
52893 return SDValue();
52894
52895 unsigned NumElts = VT.getVectorNumElements();
52896 MVT OpVT = N0.getOperand(0).getSimpleValueType();
52897 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
52898 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
52899
52900 bool Op0HiZero =
52901 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
52902 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
52903 bool Op1HiZero =
52904 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
52905 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
52906
52907 // TODO: Check for zero lower elements once we have actual codegen that
52908 // creates them.
52909 if (!Op0HiZero || !Op1HiZero)
52910 return SDValue();
52911
52912 // Create a shuffle mask packing the lower elements from each VPMADDWD.
52913 SmallVector<int> Mask;
52914 for (int i = 0; i != (int)NumElts; ++i) {
52915 Mask.push_back(2 * i);
52916 Mask.push_back(2 * (i + NumElts));
52917 }
52918
52919 SDValue LHS =
52920 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
52921 SDValue RHS =
52922 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
52923 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
52924}
52925
52926/// CMOV of constants requires materializing constant operands in registers.
52927/// Try to fold those constants into an 'add' instruction to reduce instruction
52928 /// count. We do this with CMOV rather than the generic 'select' because there are
52929/// earlier folds that may be used to turn select-of-constants into logic hacks.
52930static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
52931 const X86Subtarget &Subtarget) {
52932 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
52933 // better because we eliminate 1-2 instructions. This transform is still
52934 // an improvement without zero operands because we trade 2 move constants and
52935 // 1 add for 2 adds (LEA) as long as the constants can be represented as
52936 // immediate asm operands (fit in 32-bits).
52937 auto isSuitableCmov = [](SDValue V) {
52938 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
52939 return false;
52940 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
52941 !isa<ConstantSDNode>(V.getOperand(1)))
52942 return false;
52943 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
52944 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
52945 V.getConstantOperandAPInt(1).isSignedIntN(32));
52946 };
52947
52948 // Match an appropriate CMOV as the first operand of the add.
52949 SDValue Cmov = N->getOperand(0);
52950 SDValue OtherOp = N->getOperand(1);
52951 if (!isSuitableCmov(Cmov))
52952 std::swap(Cmov, OtherOp);
52953 if (!isSuitableCmov(Cmov))
52954 return SDValue();
52955
52956 // Don't remove a load folding opportunity for the add. That would neutralize
52957 // any improvements from removing constant materializations.
52958 if (X86::mayFoldLoad(OtherOp, Subtarget))
52959 return SDValue();
52960
52961 EVT VT = N->getValueType(0);
52962 SDLoc DL(N);
52963 SDValue FalseOp = Cmov.getOperand(0);
52964 SDValue TrueOp = Cmov.getOperand(1);
52965
52966 // We will push the add through the select, but we can potentially do better
52967 // if we know there is another add in the sequence and this is pointer math.
52968 // In that case, we can absorb an add into the trailing memory op and avoid
52969 // a 3-operand LEA which is likely slower than a 2-operand LEA.
52970 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
52971 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
52972 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
52973 all_of(N->uses(), [&](SDNode *Use) {
52974 auto *MemNode = dyn_cast<MemSDNode>(Use);
52975 return MemNode && MemNode->getBasePtr().getNode() == N;
52976 })) {
52977 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
52978 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
52979 // it is possible that choosing op1 might be better.
52980 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
52981 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
52982 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
52983 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
52984 Cmov.getOperand(2), Cmov.getOperand(3));
52985 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
52986 }
52987
52988 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
52989 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
52990 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
52991 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
52992 Cmov.getOperand(3));
52993}
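
The scalar identity behind pushAddIntoCmovOfConsts is that addition distributes over a select: add (cmov C1, C2), X equals cmov (add X, C1), (add X, C2) for either condition value. A minimal sketch (illustrative only; the cmov helper is a stand-in for X86ISD::CMOV) spot-checks this.

#include <cassert>
#include <cstdint>

// X86ISD::CMOV operand order is (FalseOp, TrueOp, CC, EFLAGS); model the
// selected value with a ternary.
static uint64_t cmov(bool Cond, uint64_t FalseV, uint64_t TrueV) {
  return Cond ? TrueV : FalseV;
}

int main() {
  const uint64_t C1 = 0, C2 = 42, X = 0xfffffffffffffff0ULL;
  // add (cmov C1, C2), X  ==  cmov (add X, C1), (add X, C2), for either
  // condition value; both sides wrap identically modulo 2^64.
  assert(cmov(false, C1, C2) + X == cmov(false, C1 + X, C2 + X));
  assert(cmov(true, C1, C2) + X == cmov(true, C1 + X, C2 + X));
  return 0;
}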
52994
52995static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
52996 TargetLowering::DAGCombinerInfo &DCI,
52997 const X86Subtarget &Subtarget) {
52998 EVT VT = N->getValueType(0);
52999 SDValue Op0 = N->getOperand(0);
53000 SDValue Op1 = N->getOperand(1);
53001 SDLoc DL(N);
53002
53003 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
53004 return Select;
53005
53006 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
53007 return MAdd;
53008 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
53009 return MAdd;
53010 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
53011 return MAdd;
53012
53013 // Try to synthesize horizontal adds from adds of shuffles.
53014 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
53015 return V;
53016
53017 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
53018 // (sub Y, (sext (vXi1 X))).
53019 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
53020 // generic DAG combine without a legal type check, but adding this there
53021 // caused regressions.
53022 if (VT.isVector()) {
53023 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53024 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
53025 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53026 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
53027 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
53028 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
53029 }
53030
53031 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
53032 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53033 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
53034 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
53035 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
53036 }
53037 }
53038
53039 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
53040 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
53041 X86::isZeroNode(Op0.getOperand(1))) {
53042 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
53043 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
53044 Op0.getOperand(0), Op0.getOperand(2));
53045 }
53046
53047 return combineAddOrSubToADCOrSBB(N, DAG);
53048}
53049
53050// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
53051// condition comes from the subtract node that produced -X. This matches the
53052// cmov expansion for absolute value. By swapping the operands we convert abs
53053// to nabs.
53054static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
53055 SDValue N0 = N->getOperand(0);
53056 SDValue N1 = N->getOperand(1);
53057
53058 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
53059 return SDValue();
53060
53061 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
53062 if (CC != X86::COND_S && CC != X86::COND_NS)
53063 return SDValue();
53064
53065 // Condition should come from a negate operation.
53066 SDValue Cond = N1.getOperand(3);
53067 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
53068 return SDValue();
53069 assert(Cond.getResNo() == 1 && "Unexpected result number");
53070
53071 // Get the X and -X from the negate.
53072 SDValue NegX = Cond.getValue(0);
53073 SDValue X = Cond.getOperand(1);
53074
53075 SDValue FalseOp = N1.getOperand(0);
53076 SDValue TrueOp = N1.getOperand(1);
53077
53078 // Cmov operands should be X and NegX. Order doesn't matter.
53079 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
53080 return SDValue();
53081
53082 // Build a new CMOV with the operands swapped.
53083 SDLoc DL(N);
53084 MVT VT = N->getSimpleValueType(0);
53085 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
53086 N1.getOperand(2), Cond);
53087 // Convert sub to add.
53088 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
53089}
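
combineSubABS relies on the value identity Y - |X| == Y + (-|X|): swapping the cmov operands turns the abs result into its negation (nabs), letting the sub become an add. A tiny standalone sketch (illustrative only, not LLVM code) checks the identity on a few values.

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (int64_t X : {-5LL, 0LL, 7LL}) { // avoid INT64_MIN, whose negation overflows
    int64_t Abs = X < 0 ? -X : X;      // value selected by the original cmov
    int64_t Nabs = -Abs;               // value selected once the cmov operands are swapped
    for (int64_t Y : {-3LL, 0LL, 100LL})
      assert(Y - Abs == Y + Nabs);     // sub of abs == add of nabs
  }
  return 0;
}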
53090
53091static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
53092 TargetLowering::DAGCombinerInfo &DCI,
53093 const X86Subtarget &Subtarget) {
53094 SDValue Op0 = N->getOperand(0);
53095 SDValue Op1 = N->getOperand(1);
53096
53097 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
53098 auto IsNonOpaqueConstant = [&](SDValue Op) {
53099 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
53100 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
53101 return !Cst->isOpaque();
53102 return true;
53103 }
53104 return false;
53105 };
53106
53107 // X86 can't encode an immediate LHS of a sub. See if we can push the
53108 // negation into a preceding instruction. If the RHS of the sub is an XOR with
53109 // one use and a constant, invert the immediate, saving one register.
53110 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
53111 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
53112 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
53113 SDLoc DL(N);
53114 EVT VT = Op0.getValueType();
53115 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
53116 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
53117 SDValue NewAdd =
53118 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
53119 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
53120 }
53121
53122 if (SDValue V = combineSubABS(N, DAG))
53123 return V;
53124
53125 // Try to synthesize horizontal subs from subs of shuffles.
53126 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
53127 return V;
53128
53129 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
53130 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
53131 X86::isZeroNode(Op1.getOperand(1))) {
53132 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
53133 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
53134 Op1.getOperand(0), Op1.getOperand(2));
53135 }
53136
53137 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
53138 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
53139 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
53140 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
53141 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
53142 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
53143 Op1.getOperand(1), Op1.getOperand(2));
53144 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
53145 Op1.getOperand(0));
53146 }
53147
53148 return combineAddOrSubToADCOrSBB(N, DAG);
53149}
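
The first rewrite in combineSub uses the two's-complement identity sub(C1, xor(X, C2)) == add(xor(X, ~C2), C1 + 1), which follows from -v == ~v + 1 and ~(X ^ C2) == X ^ ~C2. A minimal sketch (illustrative only, not LLVM code) verifies it over a few constants.

// C1 - (X ^ C2)  ==  (X ^ ~C2) + (C1 + 1), with arithmetic wrapping modulo 2^64.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint64_t C1 : {0ULL, 1ULL, 0x1234ULL, ~0ULL})
    for (uint64_t C2 : {0ULL, 0xff00ff00ff00ff00ULL, ~0ULL})
      for (uint64_t X : {0ULL, 42ULL, 0xdeadbeefULL, ~0ULL})
        assert(C1 - (X ^ C2) == ((X ^ ~C2) + (C1 + 1)));
  return 0;
}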
53150
53151static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
53152 const X86Subtarget &Subtarget) {
53153 MVT VT = N->getSimpleValueType(0);
53154 SDLoc DL(N);
53155
53156 if (N->getOperand(0) == N->getOperand(1)) {
53157 if (N->getOpcode() == X86ISD::PCMPEQ)
53158 return DAG.getConstant(-1, DL, VT);
53159 if (N->getOpcode() == X86ISD::PCMPGT)
53160 return DAG.getConstant(0, DL, VT);
53161 }
53162
53163 return SDValue();
53164}
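
combineVectorCompare folds self-compares to constants because PCMPEQ(X, X) is all-ones and PCMPGT(X, X) is all-zeros in every lane. A per-lane scalar model (illustrative only; the lane helpers are stand-ins, not intrinsics) shows the two cases.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Per-lane models of the vector compares: -1 (all bits set) on true, 0 on false.
static int32_t pcmpeqLane(int32_t A, int32_t B) { return A == B ? -1 : 0; }
static int32_t pcmpgtLane(int32_t A, int32_t B) { return A > B ? -1 : 0; }

int main() {
  for (int32_t X : {0, -7, 123456, INT32_MIN}) {
    assert(pcmpeqLane(X, X) == -1); // PCMPEQ(X, X) -> all-ones
    assert(pcmpgtLane(X, X) == 0);  // PCMPGT(X, X) -> zero
  }
  return 0;
}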
53165
53166/// Helper that combines an array of subvector ops as if they were the operands
53167/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
53168/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
53169static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
53170 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
53171 TargetLowering::DAGCombinerInfo &DCI,
53172 const X86Subtarget &Subtarget) {
53173 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
53174 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53175
53176 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
53177 return DAG.getUNDEF(VT);
53178
53179 if (llvm::all_of(Ops, [](SDValue Op) {
53180 return ISD::isBuildVectorAllZeros(Op.getNode());
53181 }))
53182 return getZeroVector(VT, Subtarget, DAG, DL);
53183
53184 SDValue Op0 = Ops[0];
53185 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
53186
53187 // Repeated subvectors.
53188 if (IsSplat &&
53189 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
53190 // If this broadcast is inserted into both halves, use a larger broadcast.
53191 if (Op0.getOpcode() == X86ISD::VBROADCAST)
53192 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
53193
53194 // If this simple subvector or scalar/subvector broadcast_load is inserted
53195 // into both halves, use a larger broadcast_load. Update other uses to use
53196 // an extracted subvector.
53197 if (ISD::isNormalLoad(Op0.getNode()) ||
53198 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
53199 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
53200 auto *Mem = cast<MemSDNode>(Op0);
53201 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
53202 ? X86ISD::VBROADCAST_LOAD
53203 : X86ISD::SUBV_BROADCAST_LOAD;
53204 if (SDValue BcastLd =
53205 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
53206 SDValue BcastSrc =
53207 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
53208 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
53209 return BcastLd;
53210 }
53211 }
53212
53213 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
53214 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
53215 (Subtarget.hasAVX2() ||
53216 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
53217 VT.getScalarType(), Subtarget)))
53218 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
53219 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
53220 Op0.getOperand(0),
53221 DAG.getIntPtrConstant(0, DL)));
53222
53223 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
53224 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53225 (Subtarget.hasAVX2() ||
53226 (EltSizeInBits >= 32 &&
53227 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
53228 Op0.getOperand(0).getValueType() == VT.getScalarType())
53229 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
53230
53231 // concat_vectors(extract_subvector(broadcast(x)),
53232 // extract_subvector(broadcast(x))) -> broadcast(x)
53233 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53234 Op0.getOperand(0).getValueType() == VT) {
53235 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
53236 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
53237 return Op0.getOperand(0);
53238 }
53239 }
53240
53241 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
53242 // Only concat of subvector high halves which vperm2x128 is best at.
53243 // TODO: This should go in combineX86ShufflesRecursively eventually.
53244 if (VT.is256BitVector() && Ops.size() == 2) {
53245 SDValue Src0 = peekThroughBitcasts(Ops[0]);
53246 SDValue Src1 = peekThroughBitcasts(Ops[1]);
53247 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53248 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
53249 EVT SrcVT0 = Src0.getOperand(0).getValueType();
53250 EVT SrcVT1 = Src1.getOperand(0).getValueType();
53251 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
53252 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
53253 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
53254 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
53255 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
53256 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
53257 DAG.getBitcast(VT, Src0.getOperand(0)),
53258 DAG.getBitcast(VT, Src1.getOperand(0)),
53259 DAG.getTargetConstant(0x31, DL, MVT::i8));
53260 }
53261 }
53262 }
53263
53264 // Repeated opcode.
53265 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
53266 // but it currently struggles with different vector widths.
53267 if (llvm::all_of(Ops, [Op0](SDValue Op) {
53268 return Op.getOpcode() == Op0.getOpcode();
53269 })) {
53270 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
53271 SmallVector<SDValue> Subs;
53272 for (SDValue SubOp : SubOps)
53273 Subs.push_back(SubOp.getOperand(I));
53274 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
53275 };
53276
53277 unsigned NumOps = Ops.size();
53278 switch (Op0.getOpcode()) {
53279 case X86ISD::VBROADCAST: {
53280 if (!IsSplat && VT == MVT::v4f64 && llvm::all_of(Ops, [](SDValue Op) {
53281 return Op.getOperand(0).getValueType().is128BitVector();
53282 }))
53283 return DAG.getNode(X86ISD::MOVDDUP, DL, VT,
53284 ConcatSubOperand(VT, Ops, 0));
53285 break;
53286 }
53287 case X86ISD::MOVDDUP:
53288 case X86ISD::MOVSHDUP:
53289 case X86ISD::MOVSLDUP: {
53290 if (!IsSplat)
53291 return DAG.getNode(Op0.getOpcode(), DL, VT,
53292 ConcatSubOperand(VT, Ops, 0));
53293 break;
53294 }
53295 case X86ISD::SHUFP: {
53296 // Add SHUFPD support if/when necessary.
53297 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
53298 llvm::all_of(Ops, [Op0](SDValue Op) {
53299 return Op.getOperand(2) == Op0.getOperand(2);
53300 })) {
53301 return DAG.getNode(Op0.getOpcode(), DL, VT,
53302 ConcatSubOperand(VT, Ops, 0),
53303 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
53304 }
53305 break;
53306 }
53307 case X86ISD::PSHUFHW:
53308 case X86ISD::PSHUFLW:
53309 case X86ISD::PSHUFD:
53310 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
53311 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
53312 return DAG.getNode(Op0.getOpcode(), DL, VT,
53313 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53314 }
53315 LLVM_FALLTHROUGH;
53316 case X86ISD::VPERMILPI:
53317 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
53318 Op0.getOperand(1) == Ops[1].getOperand(1)) {
53319 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
53320 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
53321 Op0.getOperand(1));
53322 return DAG.getBitcast(VT, Res);
53323 }
53324 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
53325 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
53326 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
53327 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
53328 return DAG.getNode(Op0.getOpcode(), DL, VT,
53329 ConcatSubOperand(VT, Ops, 0),
53330 DAG.getTargetConstant(Idx, DL, MVT::i8));
53331 }
53332 break;
53333 case X86ISD::PSHUFB:
53334 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
53335 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
53336 return DAG.getNode(Op0.getOpcode(), DL, VT,
53337 ConcatSubOperand(VT, Ops, 0),
53338 ConcatSubOperand(VT, Ops, 1));
53339 }
53340 break;
53341 case X86ISD::VPERMV3:
53342 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
53343 MVT OpVT = Op0.getSimpleValueType();
53344 int NumSrcElts = OpVT.getVectorNumElements();
53345 SmallVector<int, 64> ConcatMask;
53346 for (unsigned i = 0; i != NumOps; ++i) {
53347 SmallVector<int, 64> SubMask;
53348 SmallVector<SDValue, 2> SubOps;
53349 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
53350 SubMask))
53351 break;
53352 for (int M : SubMask) {
53353 if (0 <= M) {
53354 M += M < NumSrcElts ? 0 : NumSrcElts;
53355 M += i * NumSrcElts;
53356 }
53357 ConcatMask.push_back(M);
53358 }
53359 }
53360 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
53361 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
53362 Ops[1].getOperand(0), DAG, DL);
53363 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
53364 Ops[1].getOperand(2), DAG, DL);
53365 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
53366 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
53367 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
53368 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
53369 }
53370 }
53371 break;
53372 case X86ISD::VSHLI:
53373 case X86ISD::VSRLI:
53374 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
53375 // TODO: Move this to LowerShiftByScalarImmediate?
53376 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
53377 llvm::all_of(Ops, [](SDValue Op) {
53378 return Op.getConstantOperandAPInt(1) == 32;
53379 })) {
53380 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
53381 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
53382 if (Op0.getOpcode() == X86ISD::VSHLI) {
53383 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
53384 {8, 0, 8, 2, 8, 4, 8, 6});
53385 } else {
53386 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
53387 {1, 8, 3, 8, 5, 8, 7, 8});
53388 }
53389 return DAG.getBitcast(VT, Res);
53390 }
53391 LLVM_FALLTHROUGH;
53392 case X86ISD::VSRAI:
53393 case X86ISD::VSHL:
53394 case X86ISD::VSRL:
53395 case X86ISD::VSRA:
53396 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
53397 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
53398 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
53399 llvm::all_of(Ops, [Op0](SDValue Op) {
53400 return Op0.getOperand(1) == Op.getOperand(1);
53401 })) {
53402 return DAG.getNode(Op0.getOpcode(), DL, VT,
53403 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53404 }
53405 break;
53406 case X86ISD::VPERMI:
53407 case X86ISD::VROTLI:
53408 case X86ISD::VROTRI:
53409 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
53410 llvm::all_of(Ops, [Op0](SDValue Op) {
53411 return Op0.getOperand(1) == Op.getOperand(1);
53412 })) {
53413 return DAG.getNode(Op0.getOpcode(), DL, VT,
53414 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
53415 }
53416 break;
53417 case ISD::AND:
53418 case ISD::OR:
53419 case ISD::XOR:
53420 case X86ISD::ANDNP:
53421 // TODO: Add 256-bit support.
53422 if (!IsSplat && VT.is512BitVector()) {
53423 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
53424 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
53425 NumOps * SrcVT.getVectorNumElements());
53426 return DAG.getNode(Op0.getOpcode(), DL, VT,
53427 ConcatSubOperand(SrcVT, Ops, 0),
53428 ConcatSubOperand(SrcVT, Ops, 1));
53429 }
53430 break;
53431 case X86ISD::HADD:
53432 case X86ISD::HSUB:
53433 case X86ISD::FHADD:
53434 case X86ISD::FHSUB:
53435 case X86ISD::PACKSS:
53436 case X86ISD::PACKUS:
53437 if (!IsSplat && VT.is256BitVector() &&
53438 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
53439 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
53440 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
53441 NumOps * SrcVT.getVectorNumElements());
53442 return DAG.getNode(Op0.getOpcode(), DL, VT,
53443 ConcatSubOperand(SrcVT, Ops, 0),
53444 ConcatSubOperand(SrcVT, Ops, 1));
53445 }
53446 break;
53447 case X86ISD::PALIGNR:
53448 if (!IsSplat &&
53449 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
53450 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
53451 llvm::all_of(Ops, [Op0](SDValue Op) {
53452 return Op0.getOperand(2) == Op.getOperand(2);
53453 })) {
53454 return DAG.getNode(Op0.getOpcode(), DL, VT,
53455 ConcatSubOperand(VT, Ops, 0),
53456 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
53457 }
53458 break;
53459 }
53460 }
53461
53462 // Fold subvector loads into one.
53463 // If needed, look through bitcasts to get to the load.
53464 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
53465 bool Fast;
53466 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
53467 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53468 *FirstLd->getMemOperand(), &Fast) &&
53469 Fast) {
53470 if (SDValue Ld =
53471 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
53472 return Ld;
53473 }
53474 }
53475
53476 // Attempt to fold target constant loads.
53477 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
53478 SmallVector<APInt> EltBits;
53479 APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
53480 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
53481 APInt OpUndefElts;
53482 SmallVector<APInt> OpEltBits;
53483 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
53484 OpEltBits, true, false))
53485 break;
53486 EltBits.append(OpEltBits);
53487 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
53488 }
53489 if (EltBits.size() == VT.getVectorNumElements())
53490 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
53491 }
53492
53493 return SDValue();
53494}
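
// A minimal standalone sketch (plain C++, not the LLVM APInt API and not part
// of X86ISelLowering.cpp) of the constant-fold step above: each concatenated
// operand contributes its per-element constant bits, and its undef mask is
// spliced in at the matching element offset before the combined constant
// vector is materialized.
#include <cstdint>
#include <vector>

struct ConstVec {
  std::vector<uint64_t> EltBits; // one constant value per element
  std::vector<bool> UndefElts;   // true where the element is undef
};

static ConstVec concatConstantOps(const std::vector<ConstVec> &Ops) {
  ConstVec Out;
  for (const ConstVec &Op : Ops) {
    Out.EltBits.insert(Out.EltBits.end(), Op.EltBits.begin(), Op.EltBits.end());
    Out.UndefElts.insert(Out.UndefElts.end(), Op.UndefElts.begin(),
                         Op.UndefElts.end());
  }
  return Out; // only used when every operand folded to constants
}
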
53495
53496static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
53497 TargetLowering::DAGCombinerInfo &DCI,
53498 const X86Subtarget &Subtarget) {
53499 EVT VT = N->getValueType(0);
53500 EVT SrcVT = N->getOperand(0).getValueType();
53501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53502
53503 // Don't do anything for i1 vectors.
53504 if (VT.getVectorElementType() == MVT::i1)
53505 return SDValue();
53506
53507 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
53508 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
53509 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
53510 DCI, Subtarget))
53511 return R;
53512 }
53513
53514 return SDValue();
53515}
53516
53517static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
53518 TargetLowering::DAGCombinerInfo &DCI,
53519 const X86Subtarget &Subtarget) {
53520 if (DCI.isBeforeLegalizeOps())
53521 return SDValue();
53522
53523 MVT OpVT = N->getSimpleValueType(0);
53524
53525 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
53526
53527 SDLoc dl(N);
53528 SDValue Vec = N->getOperand(0);
53529 SDValue SubVec = N->getOperand(1);
53530
53531 uint64_t IdxVal = N->getConstantOperandVal(2);
53532 MVT SubVecVT = SubVec.getSimpleValueType();
53533
53534 if (Vec.isUndef() && SubVec.isUndef())
53535 return DAG.getUNDEF(OpVT);
53536
53537 // Inserting undefs/zeros into zeros/undefs is a zero vector.
53538 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
53539 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
53540 return getZeroVector(OpVT, Subtarget, DAG, dl);
53541
53542 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
53543 // If we're inserting into a zero vector and then into a larger zero vector,
53544 // just insert into the larger zero vector directly.
53545 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
53546 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
53547 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
53548 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53549 getZeroVector(OpVT, Subtarget, DAG, dl),
53550 SubVec.getOperand(1),
53551 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
53552 }
53553
53554 // If we're inserting into a zero vector and our input was extracted from an
53555 // insert into a zero vector of the same type and the extraction was at
53556 // least as large as the original insertion, just insert the original
53557 // subvector into a zero vector.
53558 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
53559 isNullConstant(SubVec.getOperand(1)) &&
53560 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
53561 SDValue Ins = SubVec.getOperand(0);
53562 if (isNullConstant(Ins.getOperand(2)) &&
53563 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
53564 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
53565 SubVecVT.getFixedSizeInBits())
53566 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53567 getZeroVector(OpVT, Subtarget, DAG, dl),
53568 Ins.getOperand(1), N->getOperand(2));
53569 }
53570 }
53571
53572 // Stop here if this is an i1 vector.
53573 if (IsI1Vector)
53574 return SDValue();
53575
53576 // If this is an insert of an extract, combine to a shuffle. Don't do this
53577 // if the insert or extract can be represented with a subregister operation.
53578 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53579 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
53580 (IdxVal != 0 ||
53581 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
53582 int ExtIdxVal = SubVec.getConstantOperandVal(1);
53583 if (ExtIdxVal != 0) {
53584 int VecNumElts = OpVT.getVectorNumElements();
53585 int SubVecNumElts = SubVecVT.getVectorNumElements();
53586 SmallVector<int, 64> Mask(VecNumElts);
53587 // First create an identity shuffle mask.
53588 for (int i = 0; i != VecNumElts; ++i)
53589 Mask[i] = i;
53590 // Now insert the extracted portion.
53591 for (int i = 0; i != SubVecNumElts; ++i)
53592 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
53593
53594 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
53595 }
53596 }
53597
53598 // Match concat_vector style patterns.
53599 SmallVector<SDValue, 2> SubVectorOps;
53600 if (collectConcatOps(N, SubVectorOps)) {
53601 if (SDValue Fold =
53602 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
53603 return Fold;
53604
53605 // If we're inserting all zeros into the upper half, change this to
53606 // a concat with zero. We will match this to a move
53607 // with implicit upper bit zeroing during isel.
53608 // We do this here because we don't want combineConcatVectorOps to
53609 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
53610 if (SubVectorOps.size() == 2 &&
53611 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
53612 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53613 getZeroVector(OpVT, Subtarget, DAG, dl),
53614 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
53615 }
53616
53617 // If this is a broadcast insert into an upper undef, use a larger broadcast.
53618 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
53619 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
53620
53621 // If this is a broadcast load inserted into an upper undef, use a larger
53622 // broadcast load.
53623 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
53624 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
53625 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
53626 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
53627 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
53628 SDValue BcastLd =
53629 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
53630 MemIntr->getMemoryVT(),
53631 MemIntr->getMemOperand());
53632 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
53633 return BcastLd;
53634 }
53635
53636 // If we're splatting the lower half subvector of a full vector load into the
53637 // upper half, attempt to create a subvector broadcast.
53638 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
53639 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
53640 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
53641 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
53642 if (VecLd && SubLd &&
53643 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
53644 SubVec.getValueSizeInBits() / 8, 0))
53645 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
53646 SubLd, 0, DAG);
53647 }
53648
53649 return SDValue();
53650}
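
// A standalone sketch (plain C++, not SelectionDAG, not part of the file) of
// the shuffle-mask construction used for the insert-of-extract case above:
// start from an identity mask over the destination, then overlay the extracted
// lanes, which come from the second shuffle input and are therefore offset by
// VecNumElts.
#include <vector>

static std::vector<int> buildInsertExtractMask(int VecNumElts, int SubVecNumElts,
                                               int IdxVal, int ExtIdxVal) {
  std::vector<int> Mask(VecNumElts);
  for (int i = 0; i != VecNumElts; ++i)
    Mask[i] = i;                                   // keep Vec's own lanes
  for (int i = 0; i != SubVecNumElts; ++i)
    Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; // pull lanes from SubVec's source
  return Mask;
}
// E.g. VecNumElts=8, SubVecNumElts=4, IdxVal=4, ExtIdxVal=4 gives
// {0,1,2,3,12,13,14,15}: the low half stays, the high half is taken from the
// upper 4 lanes of the second input.
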
53651
53652/// If we are extracting a subvector of a vector select and the select condition
53653/// is composed of concatenated vectors, try to narrow the select width. This
53654/// is a common pattern for AVX1 integer code because 256-bit selects may be
53655/// legal, but there is almost no integer math/logic available for 256-bit.
53656/// This function should only be called with legal types (otherwise, the calls
53657/// to get simple value types will assert).
53658static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
53659 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
53660 SmallVector<SDValue, 4> CatOps;
53661 if (Sel.getOpcode() != ISD::VSELECT ||
53662 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
53663 return SDValue();
53664
53665 // Note: We assume simple value types because this should only be called with
53666 // legal operations/types.
53667 // TODO: This can be extended to handle extraction to 256-bits.
53668 MVT VT = Ext->getSimpleValueType(0);
53669 if (!VT.is128BitVector())
53670 return SDValue();
53671
53672 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
53673 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
53674 return SDValue();
53675
53676 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
53677 MVT SelVT = Sel.getSimpleValueType();
53678 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
53679        "Unexpected vector type with legal operations");
53680
53681 unsigned SelElts = SelVT.getVectorNumElements();
53682 unsigned CastedElts = WideVT.getVectorNumElements();
53683 unsigned ExtIdx = Ext->getConstantOperandVal(1);
53684 if (SelElts % CastedElts == 0) {
53685 // The select has the same or more (narrower) elements than the extract
53686 // operand. The extraction index gets scaled by that factor.
53687 ExtIdx *= (SelElts / CastedElts);
53688 } else if (CastedElts % SelElts == 0) {
53689 // The select has fewer (wider) elements than the extract operand. Make sure
53690 // that the extraction index can be divided evenly.
53691 unsigned IndexDivisor = CastedElts / SelElts;
53692 if (ExtIdx % IndexDivisor != 0)
53693 return SDValue();
53694 ExtIdx /= IndexDivisor;
53695 } else {
53696 llvm_unreachable("Element count of simple vector types are not divisible?");
53697 }
53698
53699 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
53700 unsigned NarrowElts = SelElts / NarrowingFactor;
53701 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
53702 SDLoc DL(Ext);
53703 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
53704 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
53705 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
53706 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
53707 return DAG.getBitcast(VT, NarrowSel);
53708}
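
// A sketch (not part of the file) of the extract-index rescaling performed
// above, assuming -- as the llvm_unreachable asserts -- that one element count
// always divides the other for legal simple types. Returns -1 when the index
// does not divide evenly.
static int rescaleExtractIndex(unsigned SelElts, unsigned CastedElts,
                               unsigned ExtIdx) {
  if (SelElts % CastedElts == 0)                // select elts narrower or equal
    return static_cast<int>(ExtIdx * (SelElts / CastedElts));
  unsigned IndexDivisor = CastedElts / SelElts; // select elts are wider
  if (ExtIdx % IndexDivisor != 0)
    return -1;                                  // index lands mid-element
  return static_cast<int>(ExtIdx / IndexDivisor);
}
// E.g. extracting the v2i64 subvector at index 2 from a v8i64 bitcast of a
// v16i32 select: SelElts=16, CastedElts=8, so ExtIdx scales from 2 to 4 (the
// same 128-bit offset, measured in i32 elements).
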
53709
53710static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
53711 TargetLowering::DAGCombinerInfo &DCI,
53712 const X86Subtarget &Subtarget) {
53713 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
53714 // eventually get combined/lowered into ANDNP) with a concatenated operand,
53715 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
53716 // We let generic combining take over from there to simplify the
53717 // insert/extract and 'not'.
53718 // This pattern emerges during AVX1 legalization. We handle it before lowering
53719 // to avoid complications like splitting constant vector loads.
53720
53721 // Capture the original wide type in the likely case that we need to bitcast
53722 // back to this type.
53723 if (!N->getValueType(0).isSimple())
53724 return SDValue();
53725
53726 MVT VT = N->getSimpleValueType(0);
53727 SDValue InVec = N->getOperand(0);
53728 unsigned IdxVal = N->getConstantOperandVal(1);
53729 SDValue InVecBC = peekThroughBitcasts(InVec);
53730 EVT InVecVT = InVec.getValueType();
53731 unsigned SizeInBits = VT.getSizeInBits();
53732 unsigned InSizeInBits = InVecVT.getSizeInBits();
53733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53734
53735 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
53736 TLI.isTypeLegal(InVecVT) &&
53737 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
53738 auto isConcatenatedNot = [](SDValue V) {
53739 V = peekThroughBitcasts(V);
53740 if (!isBitwiseNot(V))
53741 return false;
53742 SDValue NotOp = V->getOperand(0);
53743 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
53744 };
53745 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
53746 isConcatenatedNot(InVecBC.getOperand(1))) {
53747 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
53748 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
53749 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
53750 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
53751 }
53752 }
53753
53754 if (DCI.isBeforeLegalizeOps())
53755 return SDValue();
53756
53757 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
53758 return V;
53759
53760 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
53761 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53762
53763 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
53764 if (VT.getScalarType() == MVT::i1)
53765 return DAG.getConstant(1, SDLoc(N), VT);
53766 return getOnesVector(VT, DAG, SDLoc(N));
53767 }
53768
53769 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
53770 return DAG.getBuildVector(
53771 VT, SDLoc(N),
53772 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
53773
53774 // If we are extracting from an insert into a larger vector, replace with a
53775 // smaller insert if we don't access less than the original subvector. Don't
53776 // do this for i1 vectors.
53777 // TODO: Relax the matching indices requirement?
53778 if (VT.getVectorElementType() != MVT::i1 &&
53779 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
53780 IdxVal == InVec.getConstantOperandVal(2) &&
53781 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
53782 SDLoc DL(N);
53783 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
53784 InVec.getOperand(0), N->getOperand(1));
53785 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
53786 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
53787 InVec.getOperand(1),
53788 DAG.getVectorIdxConstant(NewIdxVal, DL));
53789 }
53790
53791 // If we're extracting an upper subvector from a broadcast we should just
53792 // extract the lowest subvector instead which should allow
53793 // SimplifyDemandedVectorElts to do more simplifications.
53794 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
53795 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
53796 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
53797 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53798
53799 // If we're extracting a broadcasted subvector, just use the lowest subvector.
53800 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53801 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
53802 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53803
53804 // Attempt to extract from the source of a shuffle vector.
53805 if ((InSizeInBits % SizeInBits) == 0 &&
53806 (IdxVal % VT.getVectorNumElements()) == 0) {
53807 SmallVector<int, 32> ShuffleMask;
53808 SmallVector<int, 32> ScaledMask;
53809 SmallVector<SDValue, 2> ShuffleInputs;
53810 unsigned NumSubVecs = InSizeInBits / SizeInBits;
53811 // Decode the shuffle mask and scale it so it's shuffling subvectors.
53812 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
53813 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
53814 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
53815 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
53816 return DAG.getUNDEF(VT);
53817 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
53818 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53819 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
53820 if (Src.getValueSizeInBits() == InSizeInBits) {
53821 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
53822 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
53823 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
53824 SDLoc(N), SizeInBits);
53825 }
53826 }
53827 }
53828
53829 // If we're extracting the lowest subvector and we're the only user,
53830 // we may be able to perform this with a smaller vector width.
53831 unsigned InOpcode = InVec.getOpcode();
53832 if (InVec.hasOneUse()) {
53833 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
53834 // v2f64 CVTDQ2PD(v4i32).
53835 if (InOpcode == ISD::SINT_TO_FP &&
53836 InVec.getOperand(0).getValueType() == MVT::v4i32) {
53837 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
53838 }
53839 // v2f64 CVTUDQ2PD(v4i32).
53840 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
53841 InVec.getOperand(0).getValueType() == MVT::v4i32) {
53842 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
53843 }
53844 // v2f64 CVTPS2PD(v4f32).
53845 if (InOpcode == ISD::FP_EXTEND &&
53846 InVec.getOperand(0).getValueType() == MVT::v4f32) {
53847 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
53848 }
53849 }
53850 if (IdxVal == 0 &&
53851 (InOpcode == ISD::ANY_EXTEND ||
53852 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
53853 InOpcode == ISD::ZERO_EXTEND ||
53854 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
53855 InOpcode == ISD::SIGN_EXTEND ||
53856 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
53857 (SizeInBits == 128 || SizeInBits == 256) &&
53858 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
53859 SDLoc DL(N);
53860 SDValue Ext = InVec.getOperand(0);
53861 if (Ext.getValueSizeInBits() > SizeInBits)
53862 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
53863 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
53864 return DAG.getNode(ExtOp, DL, VT, Ext);
53865 }
53866 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
53867 InVec.getOperand(0).getValueType().is256BitVector() &&
53868 InVec.getOperand(1).getValueType().is256BitVector() &&
53869 InVec.getOperand(2).getValueType().is256BitVector()) {
53870 SDLoc DL(N);
53871 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
53872 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
53873 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
53874 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
53875 }
53876 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
53877 (VT.is128BitVector() || VT.is256BitVector())) {
53878 SDLoc DL(N);
53879 SDValue InVecSrc = InVec.getOperand(0);
53880 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
53881 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
53882 return DAG.getNode(InOpcode, DL, VT, Ext);
53883 }
53884 if (InOpcode == X86ISD::MOVDDUP &&
53885 (VT.is128BitVector() || VT.is256BitVector())) {
53886 SDLoc DL(N);
53887 SDValue Ext0 =
53888 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
53889 return DAG.getNode(InOpcode, DL, VT, Ext0);
53890 }
53891 }
53892
53893 // Always split vXi64 logical shifts where we're extracting the upper 32 bits
53894 // as this is very likely to fold into a shuffle/truncation.
53895 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
53896 InVecVT.getScalarSizeInBits() == 64 &&
53897 InVec.getConstantOperandAPInt(1) == 32) {
53898 SDLoc DL(N);
53899 SDValue Ext =
53900 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
53901 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
53902 }
53903
53904 return SDValue();
53905}
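
// A sketch (plain C++, not SelectionDAG, not part of the file) of the
// subvector bookkeeping in the "extract from a shuffle source" step above:
// once the mask is scaled so each entry describes a whole subvector, the
// demanded entry encodes which shuffle input to read (value / NumSubVecs) and
// which subvector of that input (value % NumSubVecs); negative entries model
// the undef/zero sentinels.
#include <vector>

struct SubVecRef {
  int Input;     // index into the shuffle inputs, or -1 for undef/zero
  int SubVecIdx; // subvector index within that input
};

static SubVecRef resolveExtractedSubvector(const std::vector<int> &ScaledMask,
                                           int NumSubVecs, int IdxVal,
                                           int NumEltsPerSubVec) {
  int SubVecIdx = IdxVal / NumEltsPerSubVec;
  int M = ScaledMask[SubVecIdx];
  if (M < 0)
    return {-1, -1};                       // whole subvector is undef or zero
  return {M / NumSubVecs, M % NumSubVecs}; // (input, subvector within input)
}
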
53906
53907static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
53908 EVT VT = N->getValueType(0);
53909 SDValue Src = N->getOperand(0);
53910 SDLoc DL(N);
53911
53912 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
53913 // This occurs frequently in our masked scalar intrinsic code and our
53914 // floating point select lowering with AVX512.
53915 // TODO: SimplifyDemandedBits instead?
53916 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
53917 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53918 if (C->getAPIntValue().isOne())
53919 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
53920 Src.getOperand(0));
53921
53922 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
53923 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53924 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
53925 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
53926 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53927 if (C->isZero())
53928 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
53929 Src.getOperand(1));
53930
53931 // Reduce v2i64 to v4i32 if we don't need the upper bits.
53932 // TODO: Move to DAGCombine/SimplifyDemandedBits?
53933 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
53934 auto IsAnyExt64 = [](SDValue Op) {
53935 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
53936 return SDValue();
53937 if (Op.getOpcode() == ISD::ANY_EXTEND &&
53938 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
53939 return Op.getOperand(0);
53940 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
53941 if (Ld->getExtensionType() == ISD::EXTLOAD &&
53942 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
53943 return Op;
53944 return SDValue();
53945 };
53946 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
53947 return DAG.getBitcast(
53948 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
53949 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
53950 }
53951
53952 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
53953 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
53954 Src.getOperand(0).getValueType() == MVT::x86mmx)
53955 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
53956
53957 // See if we're broadcasting the scalar value, in which case just reuse that.
53958 // Ensure the same SDValue from the SDNode use is being used.
53959 if (VT.getScalarType() == Src.getValueType())
53960 for (SDNode *User : Src->uses())
53961 if (User->getOpcode() == X86ISD::VBROADCAST &&
53962 Src == User->getOperand(0)) {
53963 unsigned SizeInBits = VT.getFixedSizeInBits();
53964 unsigned BroadcastSizeInBits =
53965 User->getValueSizeInBits(0).getFixedSize();
53966 if (BroadcastSizeInBits == SizeInBits)
53967 return SDValue(User, 0);
53968 if (BroadcastSizeInBits > SizeInBits)
53969 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
53970 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
53971 // coverage.
53972 }
53973
53974 return SDValue();
53975}
53976
53977// Simplify PMULDQ and PMULUDQ operations.
53978static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
53979 TargetLowering::DAGCombinerInfo &DCI,
53980 const X86Subtarget &Subtarget) {
53981 SDValue LHS = N->getOperand(0);
53982 SDValue RHS = N->getOperand(1);
53983
53984 // Canonicalize constant to RHS.
53985 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
53986 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
53987 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
53988
53989 // Multiply by zero.
53990 // Don't return RHS as it may contain UNDEFs.
53991 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
53992 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
53993
53994 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
53995 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53996 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
53997 return SDValue(N, 0);
53998
53999 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
54000 // convert it to any_extend_invec, due to the LegalOperations check, do the
54001 // conversion directly to a vector shuffle manually. This exposes combine
54002 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
54003 // combineX86ShufflesRecursively on SSE4.1 targets.
54004 // FIXME: This is basically a hack around several other issues related to
54005 // ANY_EXTEND_VECTOR_INREG.
54006 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
54007 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
54008 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
54009 LHS.getOperand(0).getValueType() == MVT::v4i32) {
54010 SDLoc dl(N);
54011 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
54012 LHS.getOperand(0), { 0, -1, 1, -1 });
54013 LHS = DAG.getBitcast(MVT::v2i64, LHS);
54014 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
54015 }
54016 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
54017 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
54018 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
54019 RHS.getOperand(0).getValueType() == MVT::v4i32) {
54020 SDLoc dl(N);
54021 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
54022 RHS.getOperand(0), { 0, -1, 1, -1 });
54023 RHS = DAG.getBitcast(MVT::v2i64, RHS);
54024 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
54025 }
54026
54027 return SDValue();
54028}
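
// Scalar model of one PMULUDQ lane (a sketch, not the production lowering):
// each 64-bit result lane is the unsigned product of the *low 32 bits* of the
// two source lanes. That is why the combine above can replace a
// zero/sign-extend-in-reg of a v4i32 source with the cheaper interleave
// shuffle {0,-1,1,-1}: the high half of every shuffled lane is never read.
#include <cstdint>

static uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return static_cast<uint64_t>(static_cast<uint32_t>(A)) *
         static_cast<uint64_t>(static_cast<uint32_t>(B));
}
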
54029
54030// Simplify VPMADDUBSW/VPMADDWD operations.
54031static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
54032 TargetLowering::DAGCombinerInfo &DCI) {
54033 EVT VT = N->getValueType(0);
54034 SDValue LHS = N->getOperand(0);
54035 SDValue RHS = N->getOperand(1);
54036
54037 // Multiply by zero.
54038 // Don't return LHS/RHS as it may contain UNDEFs.
54039 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
54040 ISD::isBuildVectorAllZeros(RHS.getNode()))
54041 return DAG.getConstant(0, SDLoc(N), VT);
54042
54043 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54044 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54045 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54046 return SDValue(N, 0);
54047
54048 return SDValue();
54049}
54050
54051static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
54052 TargetLowering::DAGCombinerInfo &DCI,
54053 const X86Subtarget &Subtarget) {
54054 EVT VT = N->getValueType(0);
54055 SDValue In = N->getOperand(0);
54056 unsigned Opcode = N->getOpcode();
54057 unsigned InOpcode = In.getOpcode();
54058 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54059 SDLoc DL(N);
54060
54061 // Try to merge vector loads and extend_inreg to an extload.
54062 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
54063 In.hasOneUse()) {
54064 auto *Ld = cast<LoadSDNode>(In);
54065 if (Ld->isSimple()) {
54066 MVT SVT = In.getSimpleValueType().getVectorElementType();
54067 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
54068 ? ISD::SEXTLOAD
54069 : ISD::ZEXTLOAD;
54070 EVT MemVT = VT.changeVectorElementType(SVT);
54071 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
54072 SDValue Load = DAG.getExtLoad(
54073 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
54074 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
54075 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
54076 return Load;
54077 }
54078 }
54079 }
54080
54081 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
54082 if (Opcode == InOpcode)
54083 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
54084
54085 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
54086 // -> EXTEND_VECTOR_INREG(X).
54087 // TODO: Handle non-zero subvector indices.
54088 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
54089 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
54090 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
54091 In.getValueSizeInBits())
54092 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
54093
54094 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
54095 // TODO: Move to DAGCombine?
54096 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
54097 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
54098 In.getValueSizeInBits() == VT.getSizeInBits()) {
54099 unsigned NumElts = VT.getVectorNumElements();
54100 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
54101 EVT EltVT = In.getOperand(0).getValueType();
54102 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
54103 for (unsigned I = 0; I != NumElts; ++I)
54104 Elts[I * Scale] = In.getOperand(I);
54105 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
54106 }
54107
54108 // Attempt to combine as a shuffle.
54109 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
54110 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
54111 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
54112 SDValue Op(N, 0);
54113 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
54114 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54115 return Res;
54116 }
54117
54118 return SDValue();
54119}
54120
54121static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
54122 TargetLowering::DAGCombinerInfo &DCI) {
54123 EVT VT = N->getValueType(0);
54124
54125 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
54126 return DAG.getConstant(0, SDLoc(N), VT);
54127
54128 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54129 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54130 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54131 return SDValue(N, 0);
54132
54133 return SDValue();
54134}
54135
54136// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
54137 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
54138 // extra instructions between the conversions due to going to scalar and back.
54139static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
54140 const X86Subtarget &Subtarget) {
54141 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
54142 return SDValue();
54143
54144 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
54145 return SDValue();
54146
54147 if (N->getValueType(0) != MVT::f32 ||
54148 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
54149 return SDValue();
54150
54151 SDLoc dl(N);
54152 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
54153 N->getOperand(0).getOperand(0));
54154 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
54155 DAG.getTargetConstant(4, dl, MVT::i32));
54156 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
54157 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
54158 DAG.getIntPtrConstant(0, dl));
54159}
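
// A hypothetical source-level pattern that produces FP16_TO_FP(FP_TO_FP16 x),
// assuming Clang's storage-only __fp16 type on an F16C target without full
// FP16 support: truncate a float to half precision and immediately widen it
// back. The combine above lowers it as a VCVTPS2PH/VCVTPH2PS pair instead of
// bouncing through a scalar i16.
static float roundViaHalf(float X) {
  return static_cast<float>(static_cast<__fp16>(X));
}
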
54160
54161static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
54162 const X86Subtarget &Subtarget) {
54163 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
54164 return SDValue();
54165
54166 if (Subtarget.hasFP16())
54167 return SDValue();
54168
54169 bool IsStrict = N->isStrictFPOpcode();
54170 EVT VT = N->getValueType(0);
54171 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54172 EVT SrcVT = Src.getValueType();
54173
54174 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
54175 return SDValue();
54176
54177 if (VT.getVectorElementType() != MVT::f32 &&
54178 VT.getVectorElementType() != MVT::f64)
54179 return SDValue();
54180
54181 unsigned NumElts = VT.getVectorNumElements();
54182 if (NumElts == 1 || !isPowerOf2_32(NumElts))
54183 return SDValue();
54184
54185 SDLoc dl(N);
54186
54187 // Convert the input to vXi16.
54188 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
54189 Src = DAG.getBitcast(IntVT, Src);
54190
54191 // Widen to at least 8 input elements.
54192 if (NumElts < 8) {
54193 unsigned NumConcats = 8 / NumElts;
54194 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
54195 : DAG.getConstant(0, dl, IntVT);
54196 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
54197 Ops[0] = Src;
54198 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
54199 }
54200
54201 // Destination is vXf32 with at least 4 elements.
54202 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
54203 std::max(4U, NumElts));
54204 SDValue Cvt, Chain;
54205 if (IsStrict) {
54206 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
54207 {N->getOperand(0), Src});
54208 Chain = Cvt.getValue(1);
54209 } else {
54210 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
54211 }
54212
54213 if (NumElts < 4) {
54214 assert(NumElts == 2 && "Unexpected size");
54215 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
54216 DAG.getIntPtrConstant(0, dl));
54217 }
54218
54219 if (IsStrict) {
54220 // Extend to the original VT if necessary.
54221 if (Cvt.getValueType() != VT) {
54222 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
54223 {Chain, Cvt});
54224 Chain = Cvt.getValue(1);
54225 }
54226 return DAG.getMergeValues({Cvt, Chain}, dl);
54227 }
54228
54229 // Extend to the original VT if necessary.
54230 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
54231}
54232
54233// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
54234// from. Limit this to cases where the loads have the same input chain and the
54235// output chains are unused. This avoids any memory ordering issues.
54236static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
54237 TargetLowering::DAGCombinerInfo &DCI) {
54238 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
54239         N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
54240        "Unknown broadcast load type");
54241
54242 // Only do this if the chain result is unused.
54243 if (N->hasAnyUseOfValue(1))
54244 return SDValue();
54245
54246 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
54247
54248 SDValue Ptr = MemIntrin->getBasePtr();
54249 SDValue Chain = MemIntrin->getChain();
54250 EVT VT = N->getSimpleValueType(0);
54251 EVT MemVT = MemIntrin->getMemoryVT();
54252
54253 // Look at other users of our base pointer and try to find a wider broadcast.
54254 // The input chain and the size of the memory VT must match.
54255 for (SDNode *User : Ptr->uses())
54256 if (User != N && User->getOpcode() == N->getOpcode() &&
54257 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
54258 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
54259 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
54260 MemVT.getSizeInBits() &&
54261 !User->hasAnyUseOfValue(1) &&
54262 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
54263 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
54264 VT.getSizeInBits());
54265 Extract = DAG.getBitcast(VT, Extract);
54266 return DCI.CombineTo(N, Extract, SDValue(User, 1));
54267 }
54268
54269 return SDValue();
54270}
54271
54272static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
54273 const X86Subtarget &Subtarget) {
54274 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
54275 return SDValue();
54276
54277 if (Subtarget.hasFP16())
54278 return SDValue();
54279
54280 EVT VT = N->getValueType(0);
54281 SDValue Src = N->getOperand(0);
54282 EVT SrcVT = Src.getValueType();
54283
54284 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
54285 SrcVT.getVectorElementType() != MVT::f32)
54286 return SDValue();
54287
54288 unsigned NumElts = VT.getVectorNumElements();
54289 if (NumElts == 1 || !isPowerOf2_32(NumElts))
54290 return SDValue();
54291
54292 SDLoc dl(N);
54293
54294 // Widen to at least 4 input elements.
54295 if (NumElts < 4)
54296 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
54297 DAG.getConstantFP(0.0, dl, SrcVT));
54298
54299 // Destination is vXi16 with at least 8 elements.
54300 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54301 std::max(8U, NumElts));
54302 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
54303 DAG.getTargetConstant(4, dl, MVT::i32));
54304
54305 // Extract down to real number of elements.
54306 if (NumElts < 8) {
54307 EVT IntVT = VT.changeVectorElementTypeToInteger();
54308 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
54309 DAG.getIntPtrConstant(0, dl));
54310 }
54311
54312 return DAG.getBitcast(VT, Cvt);
54313}
54314
54315static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
54316 SDValue Src = N->getOperand(0);
54317
54318 // Turn MOVDQ2Q+simple_load into an mmx load.
54319 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54320 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
54321
54322 if (LN->isSimple()) {
54323 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
54324 LN->getBasePtr(),
54325 LN->getPointerInfo(),
54326 LN->getOriginalAlign(),
54327 LN->getMemOperand()->getFlags());
54328 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
54329 return NewLd;
54330 }
54331 }
54332
54333 return SDValue();
54334}
54335
54336static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
54337 TargetLowering::DAGCombinerInfo &DCI) {
54338 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
54339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54340 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
54341 return SDValue(N, 0);
54342
54343 return SDValue();
54344}
54345
54346SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
54347 DAGCombinerInfo &DCI) const {
54348 SelectionDAG &DAG = DCI.DAG;
54349 switch (N->getOpcode()) {
54350 default: break;
54351 case ISD::SCALAR_TO_VECTOR:
54352 return combineScalarToVector(N, DAG);
54353 case ISD::EXTRACT_VECTOR_ELT:
54354 case X86ISD::PEXTRW:
54355 case X86ISD::PEXTRB:
54356 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
54357 case ISD::CONCAT_VECTORS:
54358 return combineConcatVectors(N, DAG, DCI, Subtarget);
54359 case ISD::INSERT_SUBVECTOR:
54360 return combineInsertSubvector(N, DAG, DCI, Subtarget);
54361 case ISD::EXTRACT_SUBVECTOR:
54362 return combineExtractSubvector(N, DAG, DCI, Subtarget);
54363 case ISD::VSELECT:
54364 case ISD::SELECT:
54365 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
54366 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
54367 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
54368 case X86ISD::CMP: return combineCMP(N, DAG);
54369 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
54370 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
54371 case X86ISD::ADD:
54372 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
54373 case X86ISD::SBB: return combineSBB(N, DAG);
54374 case X86ISD::ADC: return combineADC(N, DAG, DCI);
54375 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
54376 case ISD::SHL: return combineShiftLeft(N, DAG);
54377 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
54378 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
54379 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
54380 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
54381 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
54382 case X86ISD::BEXTR:
54383 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
54384 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
54385 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
54386 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
54387 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
54388 case X86ISD::VEXTRACT_STORE:
54389 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
54390 case ISD::SINT_TO_FP:
54391 case ISD::STRICT_SINT_TO_FP:
54392 return combineSIntToFP(N, DAG, DCI, Subtarget);
54393 case ISD::UINT_TO_FP:
54394 case ISD::STRICT_UINT_TO_FP:
54395 return combineUIntToFP(N, DAG, Subtarget);
54396 case ISD::FADD:
54397 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
54398 case X86ISD::VFCMULC:
54399 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
54400 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
54401 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
54402 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
54403 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
54404 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
54405 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
54406 case X86ISD::FXOR:
54407 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
54408 case X86ISD::FMIN:
54409 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
54410 case ISD::FMINNUM:
54411 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
54412 case X86ISD::CVTSI2P:
54413 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
54414 case X86ISD::CVTP2SI:
54415 case X86ISD::CVTP2UI:
54416 case X86ISD::STRICT_CVTTP2SI:
54417 case X86ISD::CVTTP2SI:
54418 case X86ISD::STRICT_CVTTP2UI:
54419 case X86ISD::CVTTP2UI:
54420 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
54421 case X86ISD::STRICT_CVTPH2PS:
54422 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
54423 case X86ISD::BT: return combineBT(N, DAG, DCI);
54424 case ISD::ANY_EXTEND:
54425 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
54426 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
54427 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
54428 case ISD::ANY_EXTEND_VECTOR_INREG:
54429 case ISD::SIGN_EXTEND_VECTOR_INREG:
54430 case ISD::ZERO_EXTEND_VECTOR_INREG:
54431 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
54432 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
54433 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
54434 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
54435 case X86ISD::PACKSS:
54436 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
54437 case X86ISD::HADD:
54438 case X86ISD::HSUB:
54439 case X86ISD::FHADD:
54440 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
54441 case X86ISD::VSHL:
54442 case X86ISD::VSRA:
54443 case X86ISD::VSRL:
54444 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
54445 case X86ISD::VSHLI:
54446 case X86ISD::VSRAI:
54447 case X86ISD::VSRLI:
54448 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
54449 case ISD::INSERT_VECTOR_ELT:
54450 case X86ISD::PINSRB:
54451 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
54452 case X86ISD::SHUFP: // Handle all target specific shuffles
54453 case X86ISD::INSERTPS:
54454 case X86ISD::EXTRQI:
54455 case X86ISD::INSERTQI:
54456 case X86ISD::VALIGN:
54457 case X86ISD::PALIGNR:
54458 case X86ISD::VSHLDQ:
54459 case X86ISD::VSRLDQ:
54460 case X86ISD::BLENDI:
54461 case X86ISD::UNPCKH:
54462 case X86ISD::UNPCKL:
54463 case X86ISD::MOVHLPS:
54464 case X86ISD::MOVLHPS:
54465 case X86ISD::PSHUFB:
54466 case X86ISD::PSHUFD:
54467 case X86ISD::PSHUFHW:
54468 case X86ISD::PSHUFLW:
54469 case X86ISD::MOVSHDUP:
54470 case X86ISD::MOVSLDUP:
54471 case X86ISD::MOVDDUP:
54472 case X86ISD::MOVSS:
54473 case X86ISD::MOVSD:
54474 case X86ISD::MOVSH:
54475 case X86ISD::VBROADCAST:
54476 case X86ISD::VPPERM:
54477 case X86ISD::VPERMI:
54478 case X86ISD::VPERMV:
54479 case X86ISD::VPERMV3:
54480 case X86ISD::VPERMIL2:
54481 case X86ISD::VPERMILPI:
54482 case X86ISD::VPERMILPV:
54483 case X86ISD::VPERM2X128:
54484 case X86ISD::SHUF128:
54485 case X86ISD::VZEXT_MOVL:
54486 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
54487 case X86ISD::FMADD_RND:
54488 case X86ISD::FMSUB:
54489 case X86ISD::STRICT_FMSUB:
54490 case X86ISD::FMSUB_RND:
54491 case X86ISD::FNMADD:
54492 case X86ISD::STRICT_FNMADD:
54493 case X86ISD::FNMADD_RND:
54494 case X86ISD::FNMSUB:
54495 case X86ISD::STRICT_FNMSUB:
54496 case X86ISD::FNMSUB_RND:
54497 case ISD::FMA:
54498 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
54499 case X86ISD::FMADDSUB_RND:
54500 case X86ISD::FMSUBADD_RND:
54501 case X86ISD::FMADDSUB:
54502 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
54503 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
54504 case X86ISD::MGATHER:
54505 case X86ISD::MSCATTER:
54506 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
54507 case ISD::MGATHER:
54508 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
54509 case X86ISD::PCMPEQ:
54510 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
54511 case X86ISD::PMULDQ:
54512 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
54513 case X86ISD::VPMADDUBSW:
54514 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
54515 case X86ISD::KSHIFTL:
54516 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
54517 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
54518 case ISD::STRICT_FP_EXTEND:
54519 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
54520 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
54521 case X86ISD::VBROADCAST_LOAD:
54522 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
54523 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
54524 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
54525 }
54526
54527 return SDValue();
54528}
54529
54530bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
54531 if (!isTypeLegal(VT))
54532 return false;
54533
54534 // There are no vXi8 shifts.
54535 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
54536 return false;
54537
54538 // TODO: Almost no 8-bit ops are desirable because they have no actual
54539 // size/speed advantages vs. 32-bit ops, but they do have a major
54540 // potential disadvantage by causing partial register stalls.
54541 //
54542 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
54543 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
54544 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
54545 // check for a constant operand to the multiply.
54546 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
54547 return false;
54548
54549 // i16 instruction encodings are longer and some i16 instructions are slow,
54550 // so those are not desirable.
54551 if (VT == MVT::i16) {
54552 switch (Opc) {
54553 default:
54554 break;
54555 case ISD::LOAD:
54556 case ISD::SIGN_EXTEND:
54557 case ISD::ZERO_EXTEND:
54558 case ISD::ANY_EXTEND:
54559 case ISD::SHL:
54560 case ISD::SRA:
54561 case ISD::SRL:
54562 case ISD::SUB:
54563 case ISD::ADD:
54564 case ISD::MUL:
54565 case ISD::AND:
54566 case ISD::OR:
54567 case ISD::XOR:
54568 return false;
54569 }
54570 }
54571
54572 // Any legal type not explicitly accounted for above here is desirable.
54573 return true;
54574}
54575
54576SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
54577 SDValue Value, SDValue Addr,
54578 SelectionDAG &DAG) const {
54579 const Module *M = DAG.getMachineFunction().getMMI().getModule();
54580 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
54581 if (IsCFProtectionSupported) {
54582 // In case control-flow branch protection is enabled, we need to add a
54583 // notrack prefix to the indirect branch.
54584 // In order to do that we create an NT_BRIND SDNode.
54585 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
54586 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
54587 }
54588
54589 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
54590}
54591
54592bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
54593 EVT VT = Op.getValueType();
54594 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
54595 isa<ConstantSDNode>(Op.getOperand(1));
54596
54597 // i16 is legal, but undesirable since i16 instruction encodings are longer
54598 // and some i16 instructions are slow.
54599 // 8-bit multiply-by-constant can usually be expanded to something cheaper
54600 // using LEA and/or other ALU ops.
54601 if (VT != MVT::i16 && !Is8BitMulByConstant)
54602 return false;
54603
54604 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
54605 if (!Op.hasOneUse())
54606 return false;
54607 SDNode *User = *Op->use_begin();
54608 if (!ISD::isNormalStore(User))
54609 return false;
54610 auto *Ld = cast<LoadSDNode>(Load);
54611 auto *St = cast<StoreSDNode>(User);
54612 return Ld->getBasePtr() == St->getBasePtr();
54613 };
54614
54615 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
54616 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
54617 return false;
54618 if (!Op.hasOneUse())
54619 return false;
54620 SDNode *User = *Op->use_begin();
54621 if (User->getOpcode() != ISD::ATOMIC_STORE)
54622 return false;
54623 auto *Ld = cast<AtomicSDNode>(Load);
54624 auto *St = cast<AtomicSDNode>(User);
54625 return Ld->getBasePtr() == St->getBasePtr();
54626 };
54627
54628 bool Commute = false;
54629 switch (Op.getOpcode()) {
54630 default: return false;
54631 case ISD::SIGN_EXTEND:
54632 case ISD::ZERO_EXTEND:
54633 case ISD::ANY_EXTEND:
54634 break;
54635 case ISD::SHL:
54636 case ISD::SRA:
54637 case ISD::SRL: {
54638 SDValue N0 = Op.getOperand(0);
54639 // Look out for (store (shl (load), x)).
54640 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
54641 return false;
54642 break;
54643 }
54644 case ISD::ADD:
54645 case ISD::MUL:
54646 case ISD::AND:
54647 case ISD::OR:
54648 case ISD::XOR:
54649 Commute = true;
54650 LLVM_FALLTHROUGH;
54651 case ISD::SUB: {
54652 SDValue N0 = Op.getOperand(0);
54653 SDValue N1 = Op.getOperand(1);
54654 // Avoid disabling potential load folding opportunities.
54655 if (X86::mayFoldLoad(N1, Subtarget) &&
54656 (!Commute || !isa<ConstantSDNode>(N0) ||
54657 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
54658 return false;
54659 if (X86::mayFoldLoad(N0, Subtarget) &&
54660 ((Commute && !isa<ConstantSDNode>(N1)) ||
54661 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
54662 return false;
54663 if (IsFoldableAtomicRMW(N0, Op) ||
54664 (Commute && IsFoldableAtomicRMW(N1, Op)))
54665 return false;
54666 }
54667 }
54668
54669 PVT = MVT::i32;
54670 return true;
54671}
54672
54673//===----------------------------------------------------------------------===//
54674// X86 Inline Assembly Support
54675//===----------------------------------------------------------------------===//
54676
54677 // Helper to match a string against a sequence of pieces separated by whitespace.
54678static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
54679 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
54680
54681 for (StringRef Piece : Pieces) {
54682 if (!S.startswith(Piece)) // Check if the piece matches.
54683 return false;
54684
54685 S = S.substr(Piece.size());
54686 StringRef::size_type Pos = S.find_first_not_of(" \t");
54687 if (Pos == 0) // We matched a prefix.
54688 return false;
54689
54690 S = S.substr(Pos);
54691 }
54692
54693 return S.empty();
54694}
54695
54696static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
54697
54698 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
54699 if (llvm::is_contained(AsmPieces, "~{cc}") &&
54700 llvm::is_contained(AsmPieces, "~{flags}") &&
54701 llvm::is_contained(AsmPieces, "~{fpsr}")) {
54702
54703 if (AsmPieces.size() == 3)
54704 return true;
54705 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
54706 return true;
54707 }
54708 }
54709 return false;
54710}
54711
54712bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
54713 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
54714
54715 const std::string &AsmStr = IA->getAsmString();
54716
54717 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
54718 if (!Ty || Ty->getBitWidth() % 16 != 0)
54719 return false;
54720
54721 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
54722 SmallVector<StringRef, 4> AsmPieces;
54723 SplitString(AsmStr, AsmPieces, ";\n");
54724
54725 switch (AsmPieces.size()) {
54726 default: return false;
54727 case 1:
54728 // FIXME: this should verify that we are targeting a 486 or better. If not,
54729 // we will turn this bswap into something that will be lowered to logical
54730 // ops instead of emitting the bswap asm. For now, we don't support 486 or
54731 // lower so don't worry about this.
54732 // bswap $0
54733 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
54734 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
54735 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
54736 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
54737 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
54738 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
54739 // No need to check constraints, nothing other than the equivalent of
54740 // "=r,0" would be valid here.
54741 return IntrinsicLowering::LowerToByteSwap(CI);
54742 }
54743
54744 // rorw $$8, ${0:w} --> llvm.bswap.i16
54745 if (CI->getType()->isIntegerTy(16) &&
54746 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54747 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
54748 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
54749 AsmPieces.clear();
54750 StringRef ConstraintsStr = IA->getConstraintString();
54751 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54752 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54753 if (clobbersFlagRegisters(AsmPieces))
54754 return IntrinsicLowering::LowerToByteSwap(CI);
54755 }
54756 break;
54757 case 3:
54758 if (CI->getType()->isIntegerTy(32) &&
54759 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54760 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
54761 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
54762 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
54763 AsmPieces.clear();
54764 StringRef ConstraintsStr = IA->getConstraintString();
54765 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54766 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54767 if (clobbersFlagRegisters(AsmPieces))
54768 return IntrinsicLowering::LowerToByteSwap(CI);
54769 }
54770
54771 if (CI->getType()->isIntegerTy(64)) {
54772 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
54773 if (Constraints.size() >= 2 &&
54774 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
54775 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
54776 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
54777 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
54778 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
54779 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
54780 return IntrinsicLowering::LowerToByteSwap(CI);
54781 }
54782 }
54783 break;
54784 }
54785 return false;
54786}
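
// Hypothetical user code that the hook above recognizes: a lone "bswap"
// inline-asm statement with "=r,0"-style constraints is replaced by
// llvm.bswap, so the optimizer can see through the asm block. (Clang rewrites
// "%0" to "$0" in the IR asm string, which is the form matchAsm compares.)
static unsigned swapBytes32(unsigned X) {
  __asm__("bswap %0" : "+r"(X));
  return X;
}
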
54787
54788static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
54789 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
54790 .Case("{@cca}", X86::COND_A)
54791 .Case("{@ccae}", X86::COND_AE)
54792 .Case("{@ccb}", X86::COND_B)
54793 .Case("{@ccbe}", X86::COND_BE)
54794 .Case("{@ccc}", X86::COND_B)
54795 .Case("{@cce}", X86::COND_E)
54796 .Case("{@ccz}", X86::COND_E)
54797 .Case("{@ccg}", X86::COND_G)
54798 .Case("{@ccge}", X86::COND_GE)
54799 .Case("{@ccl}", X86::COND_L)
54800 .Case("{@ccle}", X86::COND_LE)
54801 .Case("{@ccna}", X86::COND_BE)
54802 .Case("{@ccnae}", X86::COND_B)
54803 .Case("{@ccnb}", X86::COND_AE)
54804 .Case("{@ccnbe}", X86::COND_A)
54805 .Case("{@ccnc}", X86::COND_AE)
54806 .Case("{@ccne}", X86::COND_NE)
54807 .Case("{@ccnz}", X86::COND_NE)
54808 .Case("{@ccng}", X86::COND_LE)
54809 .Case("{@ccnge}", X86::COND_L)
54810 .Case("{@ccnl}", X86::COND_GE)
54811 .Case("{@ccnle}", X86::COND_G)
54812 .Case("{@ccno}", X86::COND_NO)
54813 .Case("{@ccnp}", X86::COND_NP)
54814 .Case("{@ccns}", X86::COND_NS)
54815 .Case("{@cco}", X86::COND_O)
54816 .Case("{@ccp}", X86::COND_P)
54817 .Case("{@ccs}", X86::COND_S)
54818 .Default(X86::COND_INVALID);
54819 return Cond;
54820}
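
// Illustrative note (not part of the original source): these "{@cc*}" strings
// are the flag-output operand constraints of GCC-style inline asm. A
// hypothetical use that reaches this parser as "{@ccb}" (COND_B):
//
//   int below;
//   __asm__("cmpq %2, %1" : "=@ccb"(below) : "r"(a), "r"(b));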
54821
54822/// Given a constraint letter, return the type of constraint for this target.
54823X86TargetLowering::ConstraintType
54824X86TargetLowering::getConstraintType(StringRef Constraint) const {
54825 if (Constraint.size() == 1) {
54826 switch (Constraint[0]) {
54827 case 'R':
54828 case 'q':
54829 case 'Q':
54830 case 'f':
54831 case 't':
54832 case 'u':
54833 case 'y':
54834 case 'x':
54835 case 'v':
54836 case 'l':
54837 case 'k': // AVX512 masking registers.
54838 return C_RegisterClass;
54839 case 'a':
54840 case 'b':
54841 case 'c':
54842 case 'd':
54843 case 'S':
54844 case 'D':
54845 case 'A':
54846 return C_Register;
54847 case 'I':
54848 case 'J':
54849 case 'K':
54850 case 'N':
54851 case 'G':
54852 case 'L':
54853 case 'M':
54854 return C_Immediate;
54855 case 'C':
54856 case 'e':
54857 case 'Z':
54858 return C_Other;
54859 default:
54860 break;
54861 }
54862 }
54863 else if (Constraint.size() == 2) {
54864 switch (Constraint[0]) {
54865 default:
54866 break;
54867 case 'Y':
54868 switch (Constraint[1]) {
54869 default:
54870 break;
54871 case 'z':
54872 return C_Register;
54873 case 'i':
54874 case 'm':
54875 case 'k':
54876 case 't':
54877 case '2':
54878 return C_RegisterClass;
54879 }
54880 }
54881 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
54882 return C_Other;
54883 return TargetLowering::getConstraintType(Constraint);
54884}
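
// Illustrative note (not part of the original source): examples of how the
// classification above applies to a few hypothetical constraints:
//   "x"      -> C_RegisterClass (any SSE/AVX register)
//   "a"      -> C_Register      (the fixed [ER]AX register)
//   "I"      -> C_Immediate     (small integer constant, 0..31)
//   "{@ccz}" -> C_Other         (flag output, via parseConstraintCode)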
54885
54886/// Examine constraint type and operand type and determine a weight value.
54887/// This object must already have been set up with the operand type
54888/// and the current alternative constraint selected.
54889TargetLowering::ConstraintWeight
54890 X86TargetLowering::getSingleConstraintMatchWeight(
54891 AsmOperandInfo &info, const char *constraint) const {
54892 ConstraintWeight weight = CW_Invalid;
54893 Value *CallOperandVal = info.CallOperandVal;
54894 // If we don't have a value, we can't do a match,
54895 // but allow it at the lowest weight.
54896 if (!CallOperandVal)
54897 return CW_Default;
54898 Type *type = CallOperandVal->getType();
54899 // Look at the constraint type.
54900 switch (*constraint) {
54901 default:
54902 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
54904     LLVM_FALLTHROUGH;
54904 case 'R':
54905 case 'q':
54906 case 'Q':
54907 case 'a':
54908 case 'b':
54909 case 'c':
54910 case 'd':
54911 case 'S':
54912 case 'D':
54913 case 'A':
54914 if (CallOperandVal->getType()->isIntegerTy())
54915 weight = CW_SpecificReg;
54916 break;
54917 case 'f':
54918 case 't':
54919 case 'u':
54920 if (type->isFloatingPointTy())
54921 weight = CW_SpecificReg;
54922 break;
54923 case 'y':
54924 if (type->isX86_MMXTy() && Subtarget.hasMMX())
54925 weight = CW_SpecificReg;
54926 break;
54927 case 'Y':
54928 if (StringRef(constraint).size() != 2)
54929 break;
54930 switch (constraint[1]) {
54931 default:
54932 return CW_Invalid;
54933 // XMM0
54934 case 'z':
54935 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54936 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
54937 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
54938 return CW_SpecificReg;
54939 return CW_Invalid;
54940 // Conditional OpMask regs (AVX512)
54941 case 'k':
54942 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54943 return CW_Register;
54944 return CW_Invalid;
54945 // Any MMX reg
54946 case 'm':
54947 if (type->isX86_MMXTy() && Subtarget.hasMMX())
54948 return weight;
54949 return CW_Invalid;
54950 // Any SSE reg when ISA >= SSE2, same as 'x'
54951 case 'i':
54952 case 't':
54953 case '2':
54954 if (!Subtarget.hasSSE2())
54955 return CW_Invalid;
54956 break;
54957 }
54958 break;
54959 case 'v':
54960 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
54961 weight = CW_Register;
54962     LLVM_FALLTHROUGH;
54963 case 'x':
54964 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54965 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
54966 weight = CW_Register;
54967 break;
54968 case 'k':
54969 // Enable conditional vector operations using %k<#> registers.
54970 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54971 weight = CW_Register;
54972 break;
54973 case 'I':
54974 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
54975 if (C->getZExtValue() <= 31)
54976 weight = CW_Constant;
54977 }
54978 break;
54979 case 'J':
54980 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54981 if (C->getZExtValue() <= 63)
54982 weight = CW_Constant;
54983 }
54984 break;
54985 case 'K':
54986 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54987 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
54988 weight = CW_Constant;
54989 }
54990 break;
54991 case 'L':
54992 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54993 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
54994 weight = CW_Constant;
54995 }
54996 break;
54997 case 'M':
54998 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54999 if (C->getZExtValue() <= 3)
55000 weight = CW_Constant;
55001 }
55002 break;
55003 case 'N':
55004 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55005 if (C->getZExtValue() <= 0xff)
55006 weight = CW_Constant;
55007 }
55008 break;
55009 case 'G':
55010 case 'C':
55011 if (isa<ConstantFP>(CallOperandVal)) {
55012 weight = CW_Constant;
55013 }
55014 break;
55015 case 'e':
55016 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55017 if ((C->getSExtValue() >= -0x80000000LL) &&
55018 (C->getSExtValue() <= 0x7fffffffLL))
55019 weight = CW_Constant;
55020 }
55021 break;
55022 case 'Z':
55023 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
55024 if (C->getZExtValue() <= 0xffffffff)
55025 weight = CW_Constant;
55026 }
55027 break;
55028 }
55029 return weight;
55030}
55031
55032/// Try to replace an X constraint, which matches anything, with another that
55033/// has more specific requirements based on the type of the corresponding
55034/// operand.
55035const char *X86TargetLowering::
55036LowerXConstraint(EVT ConstraintVT) const {
55037 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
55038 // 'f' like normal targets.
55039 if (ConstraintVT.isFloatingPoint()) {
55040 if (Subtarget.hasSSE1())
55041 return "x";
55042 }
55043
55044 return TargetLowering::LowerXConstraint(ConstraintVT);
55045}
55046
55047// Lower @cc targets via setcc.
55048SDValue X86TargetLowering::LowerAsmOutputForConstraint(
55049 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
55050 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
55051 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
55052 if (Cond == X86::COND_INVALID)
55053 return SDValue();
55054 // Check that return type is valid.
55055 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
55056 OpInfo.ConstraintVT.getSizeInBits() < 8)
55057 report_fatal_error("Flag output operand is of invalid type");
55058
55059 // Get EFLAGS register. Only update chain when copyfrom is glued.
55060 if (Flag.getNode()) {
55061 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
55062 Chain = Flag.getValue(1);
55063 } else
55064 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
55065 // Extract CC code.
55066 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
55067 // Extend to 32-bits
55068 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
55069
55070 return Result;
55071}
55072
55073/// Lower the specified operand into the Ops vector.
55074/// If it is invalid, don't add anything to Ops.
55075void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
55076 std::string &Constraint,
55077 std::vector<SDValue>&Ops,
55078 SelectionDAG &DAG) const {
55079 SDValue Result;
55080
55081 // Only support length 1 constraints for now.
55082 if (Constraint.length() > 1) return;
55083
55084 char ConstraintLetter = Constraint[0];
55085 switch (ConstraintLetter) {
55086 default: break;
55087 case 'I':
55088 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55089 if (C->getZExtValue() <= 31) {
55090 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55091 Op.getValueType());
55092 break;
55093 }
55094 }
55095 return;
55096 case 'J':
55097 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55098 if (C->getZExtValue() <= 63) {
55099 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55100 Op.getValueType());
55101 break;
55102 }
55103 }
55104 return;
55105 case 'K':
55106 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55107 if (isInt<8>(C->getSExtValue())) {
55108 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55109 Op.getValueType());
55110 break;
55111 }
55112 }
55113 return;
55114 case 'L':
55115 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55116 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
55117 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
55118 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
55119 Op.getValueType());
55120 break;
55121 }
55122 }
55123 return;
55124 case 'M':
55125 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55126 if (C->getZExtValue() <= 3) {
55127 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55128 Op.getValueType());
55129 break;
55130 }
55131 }
55132 return;
55133 case 'N':
55134 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55135 if (C->getZExtValue() <= 255) {
55136 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55137 Op.getValueType());
55138 break;
55139 }
55140 }
55141 return;
55142 case 'O':
55143 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55144 if (C->getZExtValue() <= 127) {
55145 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55146 Op.getValueType());
55147 break;
55148 }
55149 }
55150 return;
55151 case 'e': {
55152 // 32-bit signed value
55153 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55154 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
55155 C->getSExtValue())) {
55156 // Widen to 64 bits here to get it sign extended.
55157 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
55158 break;
55159 }
55160 // FIXME gcc accepts some relocatable values here too, but only in certain
55161 // memory models; it's complicated.
55162 }
55163 return;
55164 }
55165 case 'Z': {
55166 // 32-bit unsigned value
55167 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
55168 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
55169 C->getZExtValue())) {
55170 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
55171 Op.getValueType());
55172 break;
55173 }
55174 }
55175 // FIXME gcc accepts some relocatable values here too, but only in certain
55176 // memory models; it's complicated.
55177 return;
55178 }
55179 case 'i': {
55180 // Literal immediates are always ok.
55181 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
55182 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
55183 BooleanContent BCont = getBooleanContents(MVT::i64);
55184 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
55185 : ISD::SIGN_EXTEND;
55186 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
55187 : CST->getSExtValue();
55188 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
55189 break;
55190 }
55191
55192 // In any sort of PIC mode addresses need to be computed at runtime by
55193 // adding in a register or some sort of table lookup. These can't
55194 // be used as immediates. BlockAddresses are fine though.
55195 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
55196 !isa<BlockAddressSDNode>(Op))
55197 return;
55198
55199 // If we are in non-pic codegen mode, we allow the address of a global (with
55200 // an optional displacement) to be used with 'i'.
55201 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
55202 // If we require an extra load to get this address, as in PIC mode, we
55203 // can't accept it.
55204 if (isGlobalStubReference(
55205 Subtarget.classifyGlobalReference(GA->getGlobal())))
55206 return;
55207 break;
55208 }
55209 }
55210
55211 if (Result.getNode()) {
55212 Ops.push_back(Result);
55213 return;
55214 }
55215 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
55216}
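
// Illustrative note (not part of the original source): the letters handled
// above are x86 immediate-range constraints. For example, a hypothetical
//
//   __asm__ volatile("outb %0, %1" : : "a"(value), "N"(0x80));
//
// reaches this hook with Constraint == "N"; the constant 0x80 (<= 0xff) is
// accepted and pushed as a target constant, while a constant above 0xff
// would hit the early return and add nothing to Ops (i.e. be rejected).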
55217
55218/// Check if \p RC is a general purpose register class.
55219/// I.e., GR* or one of their variant.
55220static bool isGRClass(const TargetRegisterClass &RC) {
55221 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
55222 RC.hasSuperClassEq(&X86::GR16RegClass) ||
55223 RC.hasSuperClassEq(&X86::GR32RegClass) ||
55224 RC.hasSuperClassEq(&X86::GR64RegClass) ||
55225 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
55226}
55227
55228/// Check if \p RC is a vector register class.
55229/// I.e., FR* / VR* or one of their variant.
55230static bool isFRClass(const TargetRegisterClass &RC) {
55231 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
55232 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
55233 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
55234 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
55235 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
55236 RC.hasSuperClassEq(&X86::VR512RegClass);
55237}
55238
55239/// Check if \p RC is a mask register class.
55240/// I.e., VK* or one of their variant.
55241static bool isVKClass(const TargetRegisterClass &RC) {
55242 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
55243 RC.hasSuperClassEq(&X86::VK2RegClass) ||
55244 RC.hasSuperClassEq(&X86::VK4RegClass) ||
55245 RC.hasSuperClassEq(&X86::VK8RegClass) ||
55246 RC.hasSuperClassEq(&X86::VK16RegClass) ||
55247 RC.hasSuperClassEq(&X86::VK32RegClass) ||
55248 RC.hasSuperClassEq(&X86::VK64RegClass);
55249}
55250
55251std::pair<unsigned, const TargetRegisterClass *>
55252X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
55253 StringRef Constraint,
55254 MVT VT) const {
55255 // First, see if this is a constraint that directly corresponds to an LLVM
55256 // register class.
55257 if (Constraint.size() == 1) {
55258 // GCC Constraint Letters
55259 switch (Constraint[0]) {
55260 default: break;
55261 // 'A' means [ER]AX + [ER]DX.
55262 case 'A':
55263 if (Subtarget.is64Bit())
55264 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
55265      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
55266             "Expecting 64, 32 or 16 bit subtarget");
55267 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
55268
55269 // TODO: Slight differences here in allocation order and leaving
55270 // RIP in the class. Do they matter any more here than they do
55271 // in the normal allocation?
55272 case 'k':
55273 if (Subtarget.hasAVX512()) {
55274 if (VT == MVT::i1)
55275 return std::make_pair(0U, &X86::VK1RegClass);
55276 if (VT == MVT::i8)
55277 return std::make_pair(0U, &X86::VK8RegClass);
55278 if (VT == MVT::i16)
55279 return std::make_pair(0U, &X86::VK16RegClass);
55280 }
55281 if (Subtarget.hasBWI()) {
55282 if (VT == MVT::i32)
55283 return std::make_pair(0U, &X86::VK32RegClass);
55284 if (VT == MVT::i64)
55285 return std::make_pair(0U, &X86::VK64RegClass);
55286 }
55287 break;
55288 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
55289 if (Subtarget.is64Bit()) {
55290 if (VT == MVT::i8 || VT == MVT::i1)
55291 return std::make_pair(0U, &X86::GR8RegClass);
55292 if (VT == MVT::i16)
55293 return std::make_pair(0U, &X86::GR16RegClass);
55294 if (VT == MVT::i32 || VT == MVT::f32)
55295 return std::make_pair(0U, &X86::GR32RegClass);
55296 if (VT != MVT::f80 && !VT.isVector())
55297 return std::make_pair(0U, &X86::GR64RegClass);
55298 break;
55299 }
55300       LLVM_FALLTHROUGH;
55301 // 32-bit fallthrough
55302 case 'Q': // Q_REGS
55303 if (VT == MVT::i8 || VT == MVT::i1)
55304 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
55305 if (VT == MVT::i16)
55306 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
55307 if (VT == MVT::i32 || VT == MVT::f32 ||
55308 (!VT.isVector() && !Subtarget.is64Bit()))
55309 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
55310 if (VT != MVT::f80 && !VT.isVector())
55311 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
55312 break;
55313 case 'r': // GENERAL_REGS
55314 case 'l': // INDEX_REGS
55315 if (VT == MVT::i8 || VT == MVT::i1)
55316 return std::make_pair(0U, &X86::GR8RegClass);
55317 if (VT == MVT::i16)
55318 return std::make_pair(0U, &X86::GR16RegClass);
55319 if (VT == MVT::i32 || VT == MVT::f32 ||
55320 (!VT.isVector() && !Subtarget.is64Bit()))
55321 return std::make_pair(0U, &X86::GR32RegClass);
55322 if (VT != MVT::f80 && !VT.isVector())
55323 return std::make_pair(0U, &X86::GR64RegClass);
55324 break;
55325 case 'R': // LEGACY_REGS
55326 if (VT == MVT::i8 || VT == MVT::i1)
55327 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
55328 if (VT == MVT::i16)
55329 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
55330 if (VT == MVT::i32 || VT == MVT::f32 ||
55331 (!VT.isVector() && !Subtarget.is64Bit()))
55332 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
55333 if (VT != MVT::f80 && !VT.isVector())
55334 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
55335 break;
55336 case 'f': // FP Stack registers.
55337 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
55338 // value to the correct fpstack register class.
55339 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
55340 return std::make_pair(0U, &X86::RFP32RegClass);
55341 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
55342 return std::make_pair(0U, &X86::RFP64RegClass);
55343 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
55344 return std::make_pair(0U, &X86::RFP80RegClass);
55345 break;
55346 case 'y': // MMX_REGS if MMX allowed.
55347 if (!Subtarget.hasMMX()) break;
55348 return std::make_pair(0U, &X86::VR64RegClass);
55349 case 'v':
55350 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
55351 if (!Subtarget.hasSSE1()) break;
55352 bool VConstraint = (Constraint[0] == 'v');
55353
55354 switch (VT.SimpleTy) {
55355 default: break;
55356 // Scalar SSE types.
55357 case MVT::f16:
55358 if (VConstraint && Subtarget.hasFP16())
55359 return std::make_pair(0U, &X86::FR16XRegClass);
55360 break;
55361 case MVT::f32:
55362 case MVT::i32:
55363 if (VConstraint && Subtarget.hasVLX())
55364 return std::make_pair(0U, &X86::FR32XRegClass);
55365 return std::make_pair(0U, &X86::FR32RegClass);
55366 case MVT::f64:
55367 case MVT::i64:
55368 if (VConstraint && Subtarget.hasVLX())
55369 return std::make_pair(0U, &X86::FR64XRegClass);
55370 return std::make_pair(0U, &X86::FR64RegClass);
55371 case MVT::i128:
55372 if (Subtarget.is64Bit()) {
55373 if (VConstraint && Subtarget.hasVLX())
55374 return std::make_pair(0U, &X86::VR128XRegClass);
55375 return std::make_pair(0U, &X86::VR128RegClass);
55376 }
55377 break;
55378 // Vector types and fp128.
55379 case MVT::v8f16:
55380 if (!Subtarget.hasFP16())
55381 break;
55382         LLVM_FALLTHROUGH;
55383 case MVT::f128:
55384 case MVT::v16i8:
55385 case MVT::v8i16:
55386 case MVT::v4i32:
55387 case MVT::v2i64:
55388 case MVT::v4f32:
55389 case MVT::v2f64:
55390 if (VConstraint && Subtarget.hasVLX())
55391 return std::make_pair(0U, &X86::VR128XRegClass);
55392 return std::make_pair(0U, &X86::VR128RegClass);
55393 // AVX types.
55394 case MVT::v16f16:
55395 if (!Subtarget.hasFP16())
55396 break;
55397         LLVM_FALLTHROUGH;
55398 case MVT::v32i8:
55399 case MVT::v16i16:
55400 case MVT::v8i32:
55401 case MVT::v4i64:
55402 case MVT::v8f32:
55403 case MVT::v4f64:
55404 if (VConstraint && Subtarget.hasVLX())
55405 return std::make_pair(0U, &X86::VR256XRegClass);
55406 if (Subtarget.hasAVX())
55407 return std::make_pair(0U, &X86::VR256RegClass);
55408 break;
55409 case MVT::v32f16:
55410 if (!Subtarget.hasFP16())
55411 break;
55412         LLVM_FALLTHROUGH;
55413 case MVT::v64i8:
55414 case MVT::v32i16:
55415 case MVT::v8f64:
55416 case MVT::v16f32:
55417 case MVT::v16i32:
55418 case MVT::v8i64:
55419 if (!Subtarget.hasAVX512()) break;
55420 if (VConstraint)
55421 return std::make_pair(0U, &X86::VR512RegClass);
55422 return std::make_pair(0U, &X86::VR512_0_15RegClass);
55423 }
55424 break;
55425 }
55426 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
55427 switch (Constraint[1]) {
55428 default:
55429 break;
55430 case 'i':
55431 case 't':
55432 case '2':
55433 return getRegForInlineAsmConstraint(TRI, "x", VT);
55434 case 'm':
55435 if (!Subtarget.hasMMX()) break;
55436 return std::make_pair(0U, &X86::VR64RegClass);
55437 case 'z':
55438 if (!Subtarget.hasSSE1()) break;
55439 switch (VT.SimpleTy) {
55440 default: break;
55441 // Scalar SSE types.
55442 case MVT::f16:
55443 if (!Subtarget.hasFP16())
55444 break;
55445 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
55446 case MVT::f32:
55447 case MVT::i32:
55448 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
55449 case MVT::f64:
55450 case MVT::i64:
55451 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
55452 case MVT::v8f16:
55453 if (!Subtarget.hasFP16())
55454 break;
55455         LLVM_FALLTHROUGH;
55456 case MVT::f128:
55457 case MVT::v16i8:
55458 case MVT::v8i16:
55459 case MVT::v4i32:
55460 case MVT::v2i64:
55461 case MVT::v4f32:
55462 case MVT::v2f64:
55463 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
55464 // AVX types.
55465 case MVT::v16f16:
55466 if (!Subtarget.hasFP16())
55467 break;
55468         LLVM_FALLTHROUGH;
55469 case MVT::v32i8:
55470 case MVT::v16i16:
55471 case MVT::v8i32:
55472 case MVT::v4i64:
55473 case MVT::v8f32:
55474 case MVT::v4f64:
55475 if (Subtarget.hasAVX())
55476 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
55477 break;
55478 case MVT::v32f16:
55479 if (!Subtarget.hasFP16())
55480 break;
55481         LLVM_FALLTHROUGH;
55482 case MVT::v64i8:
55483 case MVT::v32i16:
55484 case MVT::v8f64:
55485 case MVT::v16f32:
55486 case MVT::v16i32:
55487 case MVT::v8i64:
55488 if (Subtarget.hasAVX512())
55489 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
55490 break;
55491 }
55492 break;
55493 case 'k':
55494       // This register class doesn't allocate k0 for masked vector operations.
55495 if (Subtarget.hasAVX512()) {
55496 if (VT == MVT::i1)
55497 return std::make_pair(0U, &X86::VK1WMRegClass);
55498 if (VT == MVT::i8)
55499 return std::make_pair(0U, &X86::VK8WMRegClass);
55500 if (VT == MVT::i16)
55501 return std::make_pair(0U, &X86::VK16WMRegClass);
55502 }
55503 if (Subtarget.hasBWI()) {
55504 if (VT == MVT::i32)
55505 return std::make_pair(0U, &X86::VK32WMRegClass);
55506 if (VT == MVT::i64)
55507 return std::make_pair(0U, &X86::VK64WMRegClass);
55508 }
55509 break;
55510 }
55511 }
55512
55513 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
55514 return std::make_pair(0U, &X86::GR32RegClass);
55515
55516 // Use the default implementation in TargetLowering to convert the register
55517 // constraint into a member of a register class.
55518 std::pair<Register, const TargetRegisterClass*> Res;
55519 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
55520
55521 // Not found as a standard register?
55522 if (!Res.second) {
55523 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
55524 // to/from f80.
55525 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
55526 // Map st(0) -> st(7) -> ST0
55527 if (Constraint.size() == 7 && Constraint[0] == '{' &&
55528 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
55529 Constraint[3] == '(' &&
55530 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
55531 Constraint[5] == ')' && Constraint[6] == '}') {
55532 // st(7) is not allocatable and thus not a member of RFP80. Return
55533 // singleton class in cases where we have a reference to it.
55534 if (Constraint[4] == '7')
55535 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
55536 return std::make_pair(X86::FP0 + Constraint[4] - '0',
55537 &X86::RFP80RegClass);
55538 }
55539
55540 // GCC allows "st(0)" to be called just plain "st".
55541 if (StringRef("{st}").equals_insensitive(Constraint))
55542 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
55543 }
55544
55545 // flags -> EFLAGS
55546 if (StringRef("{flags}").equals_insensitive(Constraint))
55547 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
55548
55549 // dirflag -> DF
55550 // Only allow for clobber.
55551 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
55552 VT == MVT::Other)
55553 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
55554
55555 // fpsr -> FPSW
55556 if (StringRef("{fpsr}").equals_insensitive(Constraint))
55557 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
55558
55559 return Res;
55560 }
55561
55562 // Make sure it isn't a register that requires 64-bit mode.
55563 if (!Subtarget.is64Bit() &&
55564 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
55565 TRI->getEncodingValue(Res.first) >= 8) {
55566 // Register requires REX prefix, but we're in 32-bit mode.
55567 return std::make_pair(0, nullptr);
55568 }
55569
55570 // Make sure it isn't a register that requires AVX512.
55571 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
55572 TRI->getEncodingValue(Res.first) & 0x10) {
55573 // Register requires EVEX prefix.
55574 return std::make_pair(0, nullptr);
55575 }
55576
55577 // Otherwise, check to see if this is a register class of the wrong value
55578 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
55579 // turn into {ax},{dx}.
55580 // MVT::Other is used to specify clobber names.
55581 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
55582 return Res; // Correct type already, nothing to do.
55583
55584   // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
55585   // return "eax". This should even work for things like getting 64-bit integer
55586 // registers when given an f64 type.
55587 const TargetRegisterClass *Class = Res.second;
55588 // The generic code will match the first register class that contains the
55589 // given register. Thus, based on the ordering of the tablegened file,
55590 // the "plain" GR classes might not come first.
55591 // Therefore, use a helper method.
55592 if (isGRClass(*Class)) {
55593 unsigned Size = VT.getSizeInBits();
55594 if (Size == 1) Size = 8;
55595 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
55596 if (DestReg > 0) {
55597 bool is64Bit = Subtarget.is64Bit();
55598 const TargetRegisterClass *RC =
55599 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
55600 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
55601 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
55602 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
55603 : nullptr;
55604 if (Size == 64 && !is64Bit) {
55605 // Model GCC's behavior here and select a fixed pair of 32-bit
55606 // registers.
55607 switch (DestReg) {
55608 case X86::RAX:
55609 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
55610 case X86::RDX:
55611 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
55612 case X86::RCX:
55613 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
55614 case X86::RBX:
55615 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
55616 case X86::RSI:
55617 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
55618 case X86::RDI:
55619 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
55620 case X86::RBP:
55621 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
55622 default:
55623 return std::make_pair(0, nullptr);
55624 }
55625 }
55626 if (RC && RC->contains(DestReg))
55627 return std::make_pair(DestReg, RC);
55628 return Res;
55629 }
55630 // No register found/type mismatch.
55631 return std::make_pair(0, nullptr);
55632 } else if (isFRClass(*Class)) {
55633 // Handle references to XMM physical registers that got mapped into the
55634 // wrong class. This can happen with constraints like {xmm0} where the
55635 // target independent register mapper will just pick the first match it can
55636 // find, ignoring the required type.
55637
55638 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
55639 if (VT == MVT::f16)
55640 Res.second = &X86::FR16XRegClass;
55641 else if (VT == MVT::f32 || VT == MVT::i32)
55642 Res.second = &X86::FR32XRegClass;
55643 else if (VT == MVT::f64 || VT == MVT::i64)
55644 Res.second = &X86::FR64XRegClass;
55645 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
55646 Res.second = &X86::VR128XRegClass;
55647 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
55648 Res.second = &X86::VR256XRegClass;
55649 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
55650 Res.second = &X86::VR512RegClass;
55651 else {
55652 // Type mismatch and not a clobber: Return an error;
55653 Res.first = 0;
55654 Res.second = nullptr;
55655 }
55656 } else if (isVKClass(*Class)) {
55657 if (VT == MVT::i1)
55658 Res.second = &X86::VK1RegClass;
55659 else if (VT == MVT::i8)
55660 Res.second = &X86::VK8RegClass;
55661 else if (VT == MVT::i16)
55662 Res.second = &X86::VK16RegClass;
55663 else if (VT == MVT::i32)
55664 Res.second = &X86::VK32RegClass;
55665 else if (VT == MVT::i64)
55666 Res.second = &X86::VK64RegClass;
55667 else {
55668 // Type mismatch and not a clobber: Return an error;
55669 Res.first = 0;
55670 Res.second = nullptr;
55671 }
55672 }
55673
55674 return Res;
55675}
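
// Illustrative note (not part of the original source): a few hypothetical
// constraint/VT pairs and what the code above resolves them to:
//   "k"  with MVT::i16 and AVX512  -> VK16RegClass
//   "Yz" with MVT::v4f32 and SSE1  -> XMM0 in VR128RegClass
//   "{st}"                         -> FP0 in RFP80RegClass
//   "{flags}"                      -> EFLAGS in CCRRegClass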
55676
55677InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
55678 const AddrMode &AM,
55679 Type *Ty,
55680 unsigned AS) const {
55681 // Scaling factors are not free at all.
55682 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
55683 // will take 2 allocations in the out of order engine instead of 1
55684 // for plain addressing mode, i.e. inst (reg1).
55685 // E.g.,
55686 // vaddps (%rsi,%rdx), %ymm0, %ymm1
55687 // Requires two allocations (one for the load, one for the computation)
55688 // whereas:
55689 // vaddps (%rsi), %ymm0, %ymm1
55690 // Requires just 1 allocation, i.e., freeing allocations for other operations
55691 // and having less micro operations to execute.
55692 //
55693 // For some X86 architectures, this is even worse because for instance for
55694 // stores, the complex addressing mode forces the instruction to use the
55695 // "load" ports instead of the dedicated "store" port.
55696 // E.g., on Haswell:
55697 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
55698 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
55699 if (isLegalAddressingMode(DL, AM, Ty, AS))
55700 // Scale represents reg2 * scale, thus account for 1
55701 // as soon as we use a second register.
55702 return AM.Scale != 0;
55703 return -1;
55704}
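
// Illustrative note (not part of the original source): for a legal mode the
// cost above is simply whether an index register was used. E.g. a
// hypothetical AddrMode of (BaseReg) with Scale == 0 returns 0, while
// (BaseReg + 4*IndexReg) with Scale == 4 returns 1; a mode rejected by
// isLegalAddressingMode returns -1.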
55705
55706bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
55707 // Integer division on x86 is expensive. However, when aggressively optimizing
55708 // for code size, we prefer to use a div instruction, as it is usually smaller
55709 // than the alternative sequence.
55710 // The exception to this is vector division. Since x86 doesn't have vector
55711 // integer division, leaving the division as-is is a loss even in terms of
55712 // size, because it will have to be scalarized, while the alternative code
55713 // sequence can be performed in vector form.
55714 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
55715 return OptSize && !VT.isVector();
55716}
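
// Illustrative note (not part of the original source): for a hypothetical
// function compiled with minsize, a scalar "x / 10" is kept as a single div
// instruction because it is smaller than the multiply-by-reciprocal
// expansion, whereas a vector division is still expanded, since x86 has no
// vector integer divide and the scalarized divs would be larger.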
55717
55718void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
55719 if (!Subtarget.is64Bit())
55720 return;
55721
55722 // Update IsSplitCSR in X86MachineFunctionInfo.
55723 X86MachineFunctionInfo *AFI =
55724 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
55725 AFI->setIsSplitCSR(true);
55726}
55727
55728void X86TargetLowering::insertCopiesSplitCSR(
55729 MachineBasicBlock *Entry,
55730 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
55731 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
55732 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
55733 if (!IStart)
55734 return;
55735
55736 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
55737 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
55738 MachineBasicBlock::iterator MBBI = Entry->begin();
55739 for (const MCPhysReg *I = IStart; *I; ++I) {
55740 const TargetRegisterClass *RC = nullptr;
55741 if (X86::GR64RegClass.contains(*I))
55742 RC = &X86::GR64RegClass;
55743 else
55744 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 55744)
;
55745
55746 Register NewVR = MRI->createVirtualRegister(RC);
55747 // Create copy from CSR to a virtual register.
55748 // FIXME: this currently does not emit CFI pseudo-instructions, it works
55749 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
55750 // nounwind. If we want to generalize this later, we may need to emit
55751 // CFI pseudo-instructions.
55752     assert(
55753         Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
55754         "Function should be nounwind in insertCopiesSplitCSR!");
55755 Entry->addLiveIn(*I);
55756 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
55757 .addReg(*I);
55758
55759 // Insert the copy-back instructions right before the terminator.
55760 for (auto *Exit : Exits)
55761 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
55762 TII->get(TargetOpcode::COPY), *I)
55763 .addReg(NewVR);
55764 }
55765}
55766
55767bool X86TargetLowering::supportSwiftError() const {
55768 return Subtarget.is64Bit();
55769}
55770
55771/// Returns true if stack probing through a function call is requested.
55772bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
55773 return !getStackProbeSymbolName(MF).empty();
55774}
55775
55776/// Returns true if stack probing through inline assembly is requested.
55777bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
55778
55779   // No inline stack probes for Windows; it has its own mechanism.
55780 if (Subtarget.isOSWindows() ||
55781 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55782 return false;
55783
55784 // If the function specifically requests inline stack probes, emit them.
55785 if (MF.getFunction().hasFnAttribute("probe-stack"))
55786 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
55787 "inline-asm";
55788
55789 return false;
55790}
55791
55792/// Returns the name of the symbol used to emit stack probes or the empty
55793/// string if not applicable.
55794StringRef
55795X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
55796   // Inline stack probes disable the stack probe call.
55797 if (hasInlineStackProbe(MF))
55798 return "";
55799
55800 // If the function specifically requests stack probes, emit them.
55801 if (MF.getFunction().hasFnAttribute("probe-stack"))
55802 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
55803
55804 // Generally, if we aren't on Windows, the platform ABI does not include
55805 // support for stack probes, so don't emit them.
55806 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
55807 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55808 return "";
55809
55810 // We need a stack probe to conform to the Windows ABI. Choose the right
55811 // symbol.
55812 if (Subtarget.is64Bit())
55813 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
55814 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
55815}
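
// Illustrative note (not part of the original source): hypothetical IR
// functions exercising the logic above:
//
//   define void @f() "probe-stack"="__custom_probe" { ... }  ; probe via call
//   define void @g() "probe-stack"="inline-asm" { ... }      ; inline probes,
//                                                            ; symbol name ""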
55816
55817unsigned
55818X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
55819 // The default stack probe size is 4096 if the function has no stackprobesize
55820 // attribute.
55821 unsigned StackProbeSize = 4096;
55822 const Function &Fn = MF.getFunction();
55823 if (Fn.hasFnAttribute("stack-probe-size"))
55824 Fn.getFnAttribute("stack-probe-size")
55825 .getValueAsString()
55826 .getAsInteger(0, StackProbeSize);
55827 return StackProbeSize;
55828}
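
// Illustrative note (not part of the original source): the probe interval can
// be tuned per function as well; a hypothetical
//
//   define void @h() "stack-probe-size"="8192" { ... }
//
// makes this hook return 8192 instead of the default 4096.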
55829
55830Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
55831 if (ML->isInnermost() &&
55832 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
55833 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
55834 return TargetLowering::getPrefLoopAlignment();
55835}