Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 17214, column 21
The result of the '/' expression is undefined
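
The flagged statement at line 17214 falls outside the excerpt reproduced below, so its exact context cannot be confirmed from this report. As a rough, hypothetical sketch (none of the names below come from X86ISelLowering.cpp), the message comes from the core undefined-binary-operator-result checks and indicates that, on some path the analyzer explored, the division produced a value it models as undefined; with integer '/' this is most often a divisor that can be zero on that path. The usual fix is to establish that precondition before dividing:

#include <cassert>

// Hypothetical sketch only: neither function exists in X86ISelLowering.cpp,
// and line 17214 is not shown in the excerpt below.
unsigned scaleElements(unsigned SizeInBits, unsigned EltBits) {
  // If EltBits can be 0 on some path, the result of '/' is undefined.
  return SizeInBits / EltBits;
}

unsigned scaleElementsChecked(unsigned SizeInBits, unsigned EltBits) {
  // Establish the precondition so the zero path is pruned before the divide:
  // an assert works for genuine invariants (the analyzer treats a failed
  // assert as unreachable), an explicit early-out covers reachable inputs.
  assert(EltBits != 0 && "element width must be non-zero");
  if (EltBits == 0)
    return 0;
  return SizeInBits / EltBits;
}

Which of the two guards is appropriate at line 17214 depends on whether a zero divisor at that point is an invariant violation or a reachable input.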

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1677103708 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-02-22-234837-16707-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187  // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Signed saturation subtraction.
233 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
234 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
235 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
238
239 // Funnel shifts.
240 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
241 // For slow shld targets we only lower for code size.
242 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
243
244 setOperationAction(ShiftOp , MVT::i8 , Custom);
245 setOperationAction(ShiftOp , MVT::i16 , Custom);
246 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
247 if (Subtarget.is64Bit())
248 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
249 }
250
251 if (!Subtarget.useSoftFloat()) {
252 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
253 // operation.
254 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
255 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
256 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
257 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
258 // We have an algorithm for SSE2, and we turn this into a 64-bit
259 // FILD or VCVTUSI2SS/SD for other targets.
260 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
261 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
262 // We have an algorithm for SSE2->double, and we turn this into a
263 // 64-bit FILD followed by conditional FADD for other targets.
264 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
265 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
266
267 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
268 // this operation.
269 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
270 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
271 // SSE has no i16 to fp conversion, only i32. We promote in the handler
272 // to allow f80 to use i16 and f64 to use i16 with sse1 only
273 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
274 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
275 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
276 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
277 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
278 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
279 // are Legal, f80 is custom lowered.
280 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
281 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
282
283 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
284 // this operation.
285 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
286 // FIXME: This doesn't generate invalid exception when it should. PR44019.
287 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
288 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
289 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
290 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
291 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
292 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
293 // are Legal, f80 is custom lowered.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
295 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
296
297 // Handle FP_TO_UINT by promoting the destination to a larger signed
298 // conversion.
299 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
300 // FIXME: This doesn't generate invalid exception when it should. PR44019.
301 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
302 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
303 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
305 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
306 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
307 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
308 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
309
310 setOperationAction(ISD::LRINT, MVT::f32, Custom);
311 setOperationAction(ISD::LRINT, MVT::f64, Custom);
312 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
313 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
314
315 if (!Subtarget.is64Bit()) {
316 setOperationAction(ISD::LRINT, MVT::i64, Custom);
317 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
318 }
319 }
320
321 if (Subtarget.hasSSE2()) {
322 // Custom lowering for saturating float to int conversions.
323 // We handle promotion to larger result types manually.
324 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
325 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
326 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
327 }
328 if (Subtarget.is64Bit()) {
329 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
330 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
331 }
332 }
333
334 // Handle address space casts between mixed sized pointers.
335 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
336 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
337
338 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
339 if (!Subtarget.hasSSE2()) {
340 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
341 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
342 if (Subtarget.is64Bit()) {
343 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
344 // Without SSE, i64->f64 goes through memory.
345 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
346 }
347 } else if (!Subtarget.is64Bit())
348 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
349
350 // Scalar integer divide and remainder are lowered to use operations that
351 // produce two results, to match the available instructions. This exposes
352 // the two-result form to trivial CSE, which is able to combine x/y and x%y
353 // into a single instruction.
354 //
355 // Scalar integer multiply-high is also lowered to use two-result
356 // operations, to match the available instructions. However, plain multiply
357 // (low) operations are left as Legal, as there are single-result
358 // instructions for this in x86. Using the two-result multiply instructions
359 // when both high and low results are needed must be arranged by dagcombine.
360 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
361 setOperationAction(ISD::MULHS, VT, Expand);
362 setOperationAction(ISD::MULHU, VT, Expand);
363 setOperationAction(ISD::SDIV, VT, Expand);
364 setOperationAction(ISD::UDIV, VT, Expand);
365 setOperationAction(ISD::SREM, VT, Expand);
366 setOperationAction(ISD::UREM, VT, Expand);
367 }
368
369 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
370 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
371 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
372 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 setOperationAction(ISD::BR_CC, VT, Expand);
374 setOperationAction(ISD::SELECT_CC, VT, Expand);
375 }
376 if (Subtarget.is64Bit())
377 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
378 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
379 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
380 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
381
382 setOperationAction(ISD::FREM , MVT::f32 , Expand);
383 setOperationAction(ISD::FREM , MVT::f64 , Expand);
384 setOperationAction(ISD::FREM , MVT::f80 , Expand);
385 setOperationAction(ISD::FREM , MVT::f128 , Expand);
386
387 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
388 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
389 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
390 }
391
392 // Promote the i8 variants and force them on up to i32 which has a shorter
393 // encoding.
394 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
395 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
396 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
397 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
398 // promote that too.
399 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
400 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
401
402 if (!Subtarget.hasBMI()) {
403 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
404 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
405 if (Subtarget.is64Bit()) {
406 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
407 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
408 }
409 }
410
411 if (Subtarget.hasLZCNT()) {
412 // When promoting the i8 variants, force them to i32 for a shorter
413 // encoding.
414 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
415 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
416 } else {
417 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
418 if (VT == MVT::i64 && !Subtarget.is64Bit())
419 continue;
420 setOperationAction(ISD::CTLZ , VT, Custom);
421 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
422 }
423 }
424
425 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
426 ISD::STRICT_FP_TO_FP16}) {
427 // Special handling for half-precision floating point conversions.
428 // If we don't have F16C support, then lower half float conversions
429 // into library calls.
430 setOperationAction(
431 Op, MVT::f32,
432 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
433 // There's never any support for operations beyond MVT::f32.
434 setOperationAction(Op, MVT::f64, Expand);
435 setOperationAction(Op, MVT::f80, Expand);
436 setOperationAction(Op, MVT::f128, Expand);
437 }
438
439 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
440 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
441 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
442 setTruncStoreAction(VT, MVT::f16, Expand);
443 setTruncStoreAction(VT, MVT::bf16, Expand);
444
445 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
446 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
447 }
448
449 setOperationAction(ISD::PARITY, MVT::i8, Custom);
450 setOperationAction(ISD::PARITY, MVT::i16, Custom);
451 setOperationAction(ISD::PARITY, MVT::i32, Custom);
452 if (Subtarget.is64Bit())
453 setOperationAction(ISD::PARITY, MVT::i64, Custom);
454 if (Subtarget.hasPOPCNT()) {
455 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
456 // popcntw is longer to encode than popcntl and also has a false dependency
457 // on the dest that popcntl hasn't had since Cannon Lake.
458 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
459 } else {
460 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
461 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
462 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
463 if (Subtarget.is64Bit())
464 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
465 else
466 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
467 }
468
469 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
470
471 if (!Subtarget.hasMOVBE())
472 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
473
474 // X86 wants to expand cmov itself.
475 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
476 setOperationAction(ISD::SELECT, VT, Custom);
477 setOperationAction(ISD::SETCC, VT, Custom);
478 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
479 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
480 }
481 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
482 if (VT == MVT::i64 && !Subtarget.is64Bit())
483 continue;
484 setOperationAction(ISD::SELECT, VT, Custom);
485 setOperationAction(ISD::SETCC, VT, Custom);
486 }
487
488 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
489 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
490 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
491
492 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
493 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
494 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
495 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
496 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
497 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
498 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
499 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
500
501 // Darwin ABI issue.
502 for (auto VT : { MVT::i32, MVT::i64 }) {
503 if (VT == MVT::i64 && !Subtarget.is64Bit())
504 continue;
505 setOperationAction(ISD::ConstantPool , VT, Custom);
506 setOperationAction(ISD::JumpTable , VT, Custom);
507 setOperationAction(ISD::GlobalAddress , VT, Custom);
508 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
509 setOperationAction(ISD::ExternalSymbol , VT, Custom);
510 setOperationAction(ISD::BlockAddress , VT, Custom);
511 }
512
513 // 64-bit shl, sra, srl (iff 32-bit x86)
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
517 setOperationAction(ISD::SHL_PARTS, VT, Custom);
518 setOperationAction(ISD::SRA_PARTS, VT, Custom);
519 setOperationAction(ISD::SRL_PARTS, VT, Custom);
520 }
521
522 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
523 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
524
525 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
526
527 // Expand certain atomics
528 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
529 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
530 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
531 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
532 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
533 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
534 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
535 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
536 }
537
538 if (!Subtarget.is64Bit())
539 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
540
541 if (Subtarget.canUseCMPXCHG16B())
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
543
544 // FIXME - use subtarget debug flags
545 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
546 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
547 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
548 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
549 }
550
551 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
552 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
553
554 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
555 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
556
557 setOperationAction(ISD::TRAP, MVT::Other, Legal);
558 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
559 if (Subtarget.isTargetPS())
560 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
561 else
562 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
563
564 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
565 setOperationAction(ISD::VASTART , MVT::Other, Custom);
566 setOperationAction(ISD::VAEND , MVT::Other, Expand);
567 bool Is64Bit = Subtarget.is64Bit();
568 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
569 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
570
571 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
572 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
573
574 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
575
576 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
577 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
578 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
579
580 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
581
582 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
583 setOperationAction(ISD::FABS, VT, Action);
584 setOperationAction(ISD::FNEG, VT, Action);
585 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
586 setOperationAction(ISD::FREM, VT, Action);
587 setOperationAction(ISD::FMA, VT, Action);
588 setOperationAction(ISD::FMINNUM, VT, Action);
589 setOperationAction(ISD::FMAXNUM, VT, Action);
590 setOperationAction(ISD::FMINIMUM, VT, Action);
591 setOperationAction(ISD::FMAXIMUM, VT, Action);
592 setOperationAction(ISD::FSIN, VT, Action);
593 setOperationAction(ISD::FCOS, VT, Action);
594 setOperationAction(ISD::FSINCOS, VT, Action);
595 setOperationAction(ISD::FSQRT, VT, Action);
596 setOperationAction(ISD::FPOW, VT, Action);
597 setOperationAction(ISD::FLOG, VT, Action);
598 setOperationAction(ISD::FLOG2, VT, Action);
599 setOperationAction(ISD::FLOG10, VT, Action);
600 setOperationAction(ISD::FEXP, VT, Action);
601 setOperationAction(ISD::FEXP2, VT, Action);
602 setOperationAction(ISD::FCEIL, VT, Action);
603 setOperationAction(ISD::FFLOOR, VT, Action);
604 setOperationAction(ISD::FNEARBYINT, VT, Action);
605 setOperationAction(ISD::FRINT, VT, Action);
606 setOperationAction(ISD::BR_CC, VT, Action);
607 setOperationAction(ISD::SETCC, VT, Action);
608 setOperationAction(ISD::SELECT, VT, Custom);
609 setOperationAction(ISD::SELECT_CC, VT, Action);
610 setOperationAction(ISD::FROUND, VT, Action);
611 setOperationAction(ISD::FROUNDEVEN, VT, Action);
612 setOperationAction(ISD::FTRUNC, VT, Action);
613 };
614
615 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
616 // f16, f32 and f64 use SSE.
617 // Set up the FP register classes.
618 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
619 : &X86::FR16RegClass);
620 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
621 : &X86::FR32RegClass);
622 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
623 : &X86::FR64RegClass);
624
625 // Disable f32->f64 extload as we can only generate this in one instruction
626    // under optsize. So it's easier to pattern match (fpext (load)) for that
627 // case instead of needing to emit 2 instructions for extload in the
628 // non-optsize case.
629 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
630
631 for (auto VT : { MVT::f32, MVT::f64 }) {
632 // Use ANDPD to simulate FABS.
633 setOperationAction(ISD::FABS, VT, Custom);
634
635 // Use XORP to simulate FNEG.
636 setOperationAction(ISD::FNEG, VT, Custom);
637
638 // Use ANDPD and ORPD to simulate FCOPYSIGN.
639 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
640
641 // These might be better off as horizontal vector ops.
642 setOperationAction(ISD::FADD, VT, Custom);
643 setOperationAction(ISD::FSUB, VT, Custom);
644
645 // We don't support sin/cos/fmod
646 setOperationAction(ISD::FSIN , VT, Expand);
647 setOperationAction(ISD::FCOS , VT, Expand);
648 setOperationAction(ISD::FSINCOS, VT, Expand);
649 }
650
651 // Half type will be promoted by default.
652 setF16Action(MVT::f16, Promote);
653 setOperationAction(ISD::FADD, MVT::f16, Promote);
654 setOperationAction(ISD::FSUB, MVT::f16, Promote);
655 setOperationAction(ISD::FMUL, MVT::f16, Promote);
656 setOperationAction(ISD::FDIV, MVT::f16, Promote);
657 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
658 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
659 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
660
661 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
662 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
663 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
664 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
665 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
666 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
667 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
668 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
669 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
670 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
687 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
689
690 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
691 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
692
693 // Lower this to MOVMSK plus an AND.
694 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
695 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
696
697 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
698 (UseX87 || Is64Bit)) {
699 // Use SSE for f32, x87 for f64.
700 // Set up the FP register classes.
701 addRegisterClass(MVT::f32, &X86::FR32RegClass);
702 if (UseX87)
703 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
704
705 // Use ANDPS to simulate FABS.
706 setOperationAction(ISD::FABS , MVT::f32, Custom);
707
708 // Use XORP to simulate FNEG.
709 setOperationAction(ISD::FNEG , MVT::f32, Custom);
710
711 if (UseX87)
712 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
713
714 // Use ANDPS and ORPS to simulate FCOPYSIGN.
715 if (UseX87)
716 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
717 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
718
719 // We don't support sin/cos/fmod
720 setOperationAction(ISD::FSIN , MVT::f32, Expand);
721 setOperationAction(ISD::FCOS , MVT::f32, Expand);
722 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
723
724 if (UseX87) {
725 // Always expand sin/cos functions even though x87 has an instruction.
726 setOperationAction(ISD::FSIN, MVT::f64, Expand);
727 setOperationAction(ISD::FCOS, MVT::f64, Expand);
728 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
729 }
730 } else if (UseX87) {
731 // f32 and f64 in x87.
732 // Set up the FP register classes.
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
735
736 for (auto VT : { MVT::f32, MVT::f64 }) {
737 setOperationAction(ISD::UNDEF, VT, Expand);
738 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
739
740 // Always expand sin/cos functions even though x87 has an instruction.
741 setOperationAction(ISD::FSIN , VT, Expand);
742 setOperationAction(ISD::FCOS , VT, Expand);
743 setOperationAction(ISD::FSINCOS, VT, Expand);
744 }
745 }
746
747 // Expand FP32 immediates into loads from the stack, save special cases.
748 if (isTypeLegal(MVT::f32)) {
749 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
750 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
751 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
752 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
753 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
754 } else // SSE immediates.
755 addLegalFPImmediate(APFloat(+0.0f)); // xorps
756 }
757 // Expand FP64 immediates into loads from the stack, save special cases.
758 if (isTypeLegal(MVT::f64)) {
759 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
760 addLegalFPImmediate(APFloat(+0.0)); // FLD0
761 addLegalFPImmediate(APFloat(+1.0)); // FLD1
762 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
763 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
764 } else // SSE immediates.
765 addLegalFPImmediate(APFloat(+0.0)); // xorpd
766 }
767 // Support fp16 0 immediate.
768 if (isTypeLegal(MVT::f16))
769 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
770
771 // Handle constrained floating-point operations of scalar.
772 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
773 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
774 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
775 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
776 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
777 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
778 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
779 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
780 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
781 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
782 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
783 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
784
785 // We don't support FMA.
786 setOperationAction(ISD::FMA, MVT::f64, Expand);
787 setOperationAction(ISD::FMA, MVT::f32, Expand);
788
789 // f80 always uses X87.
790 if (UseX87) {
791 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
792 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
793 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
794 {
795 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
796 addLegalFPImmediate(TmpFlt); // FLD0
797 TmpFlt.changeSign();
798 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
799
800 bool ignored;
801 APFloat TmpFlt2(+1.0);
802 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
803 &ignored);
804 addLegalFPImmediate(TmpFlt2); // FLD1
805 TmpFlt2.changeSign();
806 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
807 }
808
809 // Always expand sin/cos functions even though x87 has an instruction.
810 setOperationAction(ISD::FSIN , MVT::f80, Expand);
811 setOperationAction(ISD::FCOS , MVT::f80, Expand);
812 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
813
814 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
815 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
816 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
817 setOperationAction(ISD::FRINT, MVT::f80, Expand);
818 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
819 setOperationAction(ISD::FMA, MVT::f80, Expand);
820 setOperationAction(ISD::LROUND, MVT::f80, Expand);
821 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
822 setOperationAction(ISD::LRINT, MVT::f80, Custom);
823 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
824
825 // Handle constrained floating-point operations of scalar.
826 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
827 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
828 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
829 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
830 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
831 if (isTypeLegal(MVT::f16)) {
832 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
833 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
834 } else {
835 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
836 }
837 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
838 // as Custom.
839 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
840 }
841
842 // f128 uses xmm registers, but most operations require libcalls.
843 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
844 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
845 : &X86::VR128RegClass);
846
847 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
848
849 setOperationAction(ISD::FADD, MVT::f128, LibCall);
850 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
851 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
852 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
853 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
854 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
855 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
856 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
857 setOperationAction(ISD::FMA, MVT::f128, LibCall);
858 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
859
860 setOperationAction(ISD::FABS, MVT::f128, Custom);
861 setOperationAction(ISD::FNEG, MVT::f128, Custom);
862 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
863
864 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
866 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
868 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
869 // No STRICT_FSINCOS
870 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
871 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
872
873 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
874 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
875 // We need to custom handle any FP_ROUND with an f128 input, but
876 // LegalizeDAG uses the result type to know when to run a custom handler.
877 // So we have to list all legal floating point result types here.
878 if (isTypeLegal(MVT::f32)) {
879 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
880 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
881 }
882 if (isTypeLegal(MVT::f64)) {
883 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
884 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
885 }
886 if (isTypeLegal(MVT::f80)) {
887 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
888 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
889 }
890
891 setOperationAction(ISD::SETCC, MVT::f128, Custom);
892
893 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
894 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
895 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
896 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
897 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
898 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
899 }
900
901 // Always use a library call for pow.
902 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
903 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
904 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
905 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
906
907 setOperationAction(ISD::FLOG, MVT::f80, Expand);
908 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
909 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
910 setOperationAction(ISD::FEXP, MVT::f80, Expand);
911 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
912 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
913 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
914
915 // Some FP actions are always expanded for vector types.
916 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
917 MVT::v4f32, MVT::v8f32, MVT::v16f32,
918 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
919 setOperationAction(ISD::FSIN, VT, Expand);
920 setOperationAction(ISD::FSINCOS, VT, Expand);
921 setOperationAction(ISD::FCOS, VT, Expand);
922 setOperationAction(ISD::FREM, VT, Expand);
923 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
924 setOperationAction(ISD::FPOW, VT, Expand);
925 setOperationAction(ISD::FLOG, VT, Expand);
926 setOperationAction(ISD::FLOG2, VT, Expand);
927 setOperationAction(ISD::FLOG10, VT, Expand);
928 setOperationAction(ISD::FEXP, VT, Expand);
929 setOperationAction(ISD::FEXP2, VT, Expand);
930 }
931
932 // First set operation action for all vector types to either promote
933 // (for widening) or expand (for scalarization). Then we will selectively
934 // turn on ones that can be effectively codegen'd.
935 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
936 setOperationAction(ISD::SDIV, VT, Expand);
937 setOperationAction(ISD::UDIV, VT, Expand);
938 setOperationAction(ISD::SREM, VT, Expand);
939 setOperationAction(ISD::UREM, VT, Expand);
940 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
941 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
942 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
943 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
944 setOperationAction(ISD::FMA, VT, Expand);
945 setOperationAction(ISD::FFLOOR, VT, Expand);
946 setOperationAction(ISD::FCEIL, VT, Expand);
947 setOperationAction(ISD::FTRUNC, VT, Expand);
948 setOperationAction(ISD::FRINT, VT, Expand);
949 setOperationAction(ISD::FNEARBYINT, VT, Expand);
950 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
951 setOperationAction(ISD::MULHS, VT, Expand);
952 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
953 setOperationAction(ISD::MULHU, VT, Expand);
954 setOperationAction(ISD::SDIVREM, VT, Expand);
955 setOperationAction(ISD::UDIVREM, VT, Expand);
956 setOperationAction(ISD::CTPOP, VT, Expand);
957 setOperationAction(ISD::CTTZ, VT, Expand);
958 setOperationAction(ISD::CTLZ, VT, Expand);
959 setOperationAction(ISD::ROTL, VT, Expand);
960 setOperationAction(ISD::ROTR, VT, Expand);
961 setOperationAction(ISD::BSWAP, VT, Expand);
962 setOperationAction(ISD::SETCC, VT, Expand);
963 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
964 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
965 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
966 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
967 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
968 setOperationAction(ISD::TRUNCATE, VT, Expand);
969 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
970 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
971 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
972 setOperationAction(ISD::SELECT_CC, VT, Expand);
973 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
974 setTruncStoreAction(InnerVT, VT, Expand);
975
976 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
977 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
978
979 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
980 // types, we have to deal with them whether we ask for Expansion or not.
981 // Setting Expand causes its own optimisation problems though, so leave
982 // them legal.
983 if (VT.getVectorElementType() == MVT::i1)
984 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
985
986 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
987 // split/scalarized right now.
988 if (VT.getVectorElementType() == MVT::f16 ||
989 VT.getVectorElementType() == MVT::bf16)
990 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
991 }
992 }
993
994 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
995 // with -msoft-float, disable use of MMX as well.
996 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
997 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
998 // No operations on x86mmx supported, everything uses intrinsics.
999 }
1000
1001 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1002 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1003 : &X86::VR128RegClass);
1004
1005 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1006 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1007 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1008 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1009 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1010 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1013
1014 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1015 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1016
1017 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1018 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1019 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1020 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1021 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1022 }
1023
1024 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1025 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1026 : &X86::VR128RegClass);
1027
1028 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1029 // registers cannot be used even for integer operations.
1030 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1031 : &X86::VR128RegClass);
1032 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1033 : &X86::VR128RegClass);
1034 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1035 : &X86::VR128RegClass);
1036 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1037 : &X86::VR128RegClass);
1038 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1039 : &X86::VR128RegClass);
1040
1041 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1042 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1043 setOperationAction(ISD::SDIV, VT, Custom);
1044 setOperationAction(ISD::SREM, VT, Custom);
1045 setOperationAction(ISD::UDIV, VT, Custom);
1046 setOperationAction(ISD::UREM, VT, Custom);
1047 }
1048
1049 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1050 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1051 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1052
1053 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1054 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1055 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1056 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1057 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1058 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1059 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1060 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1061 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1062 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1063 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1064 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1065
1066 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1067 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1068 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1069
1070 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1071 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1072 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1073
1074 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1075 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1076 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1077 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1078 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1079 }
1080
1081 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1082 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1083
1084 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1085 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1086 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1087 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1088 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1089 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1090 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1091 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1092 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1093 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1094
1095 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1096 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1097 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1098 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1099
1100 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1101 setOperationAction(ISD::SETCC, VT, Custom);
1102 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1103 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1104 setOperationAction(ISD::CTPOP, VT, Custom);
1105 setOperationAction(ISD::ABS, VT, Custom);
1106
1107 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1108 // setcc all the way to isel and prefer SETGT in some isel patterns.
1109 setCondCodeAction(ISD::SETLT, VT, Custom);
1110 setCondCodeAction(ISD::SETLE, VT, Custom);
1111 }
1112
1113 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1114 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1115 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1116 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1117 setOperationAction(ISD::VSELECT, VT, Custom);
1118 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1119 }
1120
1121 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1122 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1123 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1124 setOperationAction(ISD::VSELECT, VT, Custom);
1125
1126 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1127 continue;
1128
1129 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1130 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1131 }
1132 setF16Action(MVT::v8f16, Expand);
1133 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1134 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1135 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1136 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1137
1138 // Custom lower v2i64 and v2f64 selects.
1139 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1140 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1141 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1142 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1143 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1144 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1145
1146 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1147 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1148 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1149 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1150 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1151 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1152
1153 // Custom legalize these to avoid over promotion or custom promotion.
1154 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1155 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1156 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1157 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1158 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1159 }
1160
1161 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1162 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1163 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1164 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1165
1166 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1167 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1168
1169 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1170 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1171
1172 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1173 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1174 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1175 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1176 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1177
1178 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1179 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1180 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1181 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1182
1183 // We want to legalize this to an f64 load rather than an i64 load on
1184 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1185 // store.
1186 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1187 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1188 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1189 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1190 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1191 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1192
1193 // Add 32-bit vector stores to help vectorization opportunities.
1194 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1195 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1196
1197 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1198 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1199 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1200 if (!Subtarget.hasAVX512())
1201 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1202
1203 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1204 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1205 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1206
1207 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1208
1209 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1210 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1212 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1213 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1214 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1215
1216 // In the customized shift lowering, the legal v4i32/v2i64 cases
1217 // in AVX2 will be recognized.
1218 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1219 setOperationAction(ISD::SRL, VT, Custom);
1220 setOperationAction(ISD::SHL, VT, Custom);
1221 setOperationAction(ISD::SRA, VT, Custom);
1222 if (VT == MVT::v2i64) continue;
1223 setOperationAction(ISD::ROTL, VT, Custom);
1224 setOperationAction(ISD::ROTR, VT, Custom);
1225 setOperationAction(ISD::FSHL, VT, Custom);
1226 setOperationAction(ISD::FSHR, VT, Custom);
1227 }
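// For illustration (sketch only): a vector shift like
//   %r = shl <4 x i32> %a, %b
// reaches the Custom SHL hook above. Under AVX2 the per-element v4i32/v2i64
// cases map onto the VPSLLVD/VPSLLVQ family, while on plain SSE2 the custom
// lowering roughly falls back to splitting, shuffle, or scalar sequences.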
1228
1229 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1230 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1231 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1232 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1233 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1234 }
1235
1236 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1237 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1238 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1239 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1240 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1241 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1242 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1243 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1244 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1245
1246 // These might be better off as horizontal vector ops.
1247 setOperationAction(ISD::ADD, MVT::i16, Custom);
1248 setOperationAction(ISD::ADD, MVT::i32, Custom);
1249 setOperationAction(ISD::SUB, MVT::i16, Custom);
1250 setOperationAction(ISD::SUB, MVT::i32, Custom);
1251 }
1252
1253 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1254 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1255 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1256 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1257 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1258 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1259 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1260 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1261 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1262 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1263 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1264 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1265 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1266 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1267
1268 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1269 }
1270
1271 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1272 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1273 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1274 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1275 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1276 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1277 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1278 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1279
1280 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1281 setOperationAction(ISD::ABDS, VT, Custom);
1282 setOperationAction(ISD::ABDU, VT, Custom);
1283 }
1284
1285 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1286 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1287 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1288
1289 // FIXME: Do we need to handle scalar-to-vector here?
1290 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1292
1293 // We directly match byte blends in the backend as they match the VSELECT
1294 // condition form.
1295 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1296
1297 // SSE41 brings specific instructions for doing vector sign extend even in
1298 // cases where we don't have SRA.
1299 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1300 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1301 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1302 }
1303
1304 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1305 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1306 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1307 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1308 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1309 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1310 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1311 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1312 }
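// For illustration (sketch only): marking SEXTLOAD v4i32 <- v4i8 Legal lets
//   %w = load <4 x i8>, ptr %p
//   %x = sext <4 x i8> %w to <4 x i32>
// be selected as a single PMOVSXBD with a memory operand, assuming the
// extending load can be folded (the load has no other users).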
1313
1314 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1315 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1316 // do the pre and post work in the vector domain.
1317 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1318 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1319 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1320 // so that DAG combine doesn't try to turn it into uint_to_fp.
1321 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1322 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1323 }
1324 }
1325
1326 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1327 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1328 }
1329
1330 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1331 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1332 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1333 setOperationAction(ISD::ROTL, VT, Custom);
1334 setOperationAction(ISD::ROTR, VT, Custom);
1335 }
1336
1337 // XOP can efficiently perform BITREVERSE with VPPERM.
1338 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1339 setOperationAction(ISD::BITREVERSE, VT, Custom);
1340
1341 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1342 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1343 setOperationAction(ISD::BITREVERSE, VT, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1347 bool HasInt256 = Subtarget.hasInt256();
1348
1349 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1350 : &X86::VR256RegClass);
1351 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1352 : &X86::VR256RegClass);
1353 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1354 : &X86::VR256RegClass);
1355 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1356 : &X86::VR256RegClass);
1357 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1358 : &X86::VR256RegClass);
1359 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1360 : &X86::VR256RegClass);
1361 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1362 : &X86::VR256RegClass);
1363
1364 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1365 setOperationAction(ISD::FFLOOR, VT, Legal);
1366 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1367 setOperationAction(ISD::FCEIL, VT, Legal);
1368 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1369 setOperationAction(ISD::FTRUNC, VT, Legal);
1370 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1371 setOperationAction(ISD::FRINT, VT, Legal);
1372 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1373 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1374 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1375 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1376 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1377
1378 setOperationAction(ISD::FROUND, VT, Custom);
1379
1380 setOperationAction(ISD::FNEG, VT, Custom);
1381 setOperationAction(ISD::FABS, VT, Custom);
1382 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1383 }
1384
1385 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1386 // even though v8i16 is a legal type.
1387 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1388 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1389 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1390 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1391 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1392 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1394
1395 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1396 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1397 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1398 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1399 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1400 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1401
1402 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1403 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1404 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1405 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1406 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1407 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1408 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1409 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1410 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1411 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1412 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1413
1414 if (!Subtarget.hasAVX512())
1415 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1416
1417 // In the customized shift lowering, the legal v8i32/v4i64 cases
1418 // in AVX2 will be recognized.
1419 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1420 setOperationAction(ISD::SRL, VT, Custom);
1421 setOperationAction(ISD::SHL, VT, Custom);
1422 setOperationAction(ISD::SRA, VT, Custom);
1423 setOperationAction(ISD::ABDS, VT, Custom);
1424 setOperationAction(ISD::ABDU, VT, Custom);
1425 if (VT == MVT::v4i64) continue;
1426 setOperationAction(ISD::ROTL, VT, Custom);
1427 setOperationAction(ISD::ROTR, VT, Custom);
1428 setOperationAction(ISD::FSHL, VT, Custom);
1429 setOperationAction(ISD::FSHR, VT, Custom);
1430 }
1431
1432 // These types need custom splitting if their input is a 128-bit vector.
1433 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1434 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1435 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1437
1438 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1439 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1440 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1441 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1442 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1443 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1444 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1445
1446 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1447 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1448 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1449 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1450 }
1451
1452 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1453 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1454 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1455 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1456
1457 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1458 setOperationAction(ISD::SETCC, VT, Custom);
1459 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1460 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1461 setOperationAction(ISD::CTPOP, VT, Custom);
1462 setOperationAction(ISD::CTLZ, VT, Custom);
1463
1464 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1465 // setcc all the way to isel and prefer SETGT in some isel patterns.
1466 setCondCodeAction(ISD::SETLT, VT, Custom);
1467 setCondCodeAction(ISD::SETLE, VT, Custom);
1468 }
1469
1470 if (Subtarget.hasAnyFMA()) {
1471 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1472 MVT::v2f64, MVT::v4f64 }) {
1473 setOperationAction(ISD::FMA, VT, Legal);
1474 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1475 }
1476 }
1477
1478 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1480 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1481 }
1482
1483 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1484 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1485 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1486 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1487
1488 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1489 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1490 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1491 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1492 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1493 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1494 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1495 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1496
1497 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1498 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1499
1500 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1501 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1502 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1503 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1504 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1505
1506 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1509 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1510 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1512 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1513 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1514 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1515 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1516 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1517 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1518
1519 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1520 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1521 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1522 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1525 }
1526
1527 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1529 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1530 }
1531
1532 if (HasInt256) {
1533 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1534 // when we have a 256-bit-wide blend with immediate.
1535 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1536 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1537
1538 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1539 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1540 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1541 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1542 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1543 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1544 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1545 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1546 }
1547 }
1548
1549 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1550 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1551 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1552 setOperationAction(ISD::MSTORE, VT, Legal);
1553 }
1554
1555 // Extract subvector is special because the value type
1556 // (result) is 128-bit but the source is 256-bit wide.
1557 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1558 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1559 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1560 }
1561
1562 // Custom lower several nodes for 256-bit types.
1563 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1564 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1565 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1566 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1567 setOperationAction(ISD::VSELECT, VT, Custom);
1568 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1569 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1570 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1571 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1572 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1573 setOperationAction(ISD::STORE, VT, Custom);
1574 }
1575 setF16Action(MVT::v16f16, Expand);
1576 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1577 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1578 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1579 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1580
1581 if (HasInt256) {
1582 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1583
1584 // Custom legalize 2x32 to get a little better code.
1585 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1586 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1587
1588 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1589 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1590 setOperationAction(ISD::MGATHER, VT, Custom);
1591 }
1592 }
1593
1594 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1595 Subtarget.hasF16C()) {
1596 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1597 setOperationAction(ISD::FP_ROUND, VT, Custom);
1598 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1599 }
1600 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1601 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1602 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1603 }
1604 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1605 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1606 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1607 }
1608
1609 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1610 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1611 }
1612
1613 // This block controls legalization of the mask vector sizes that are
1614 // available with AVX512. 512-bit vectors are in a separate block controlled
1615 // by useAVX512Regs.
1616 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1617 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1618 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1619 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1620 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1621 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1622
1623 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1624 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1625 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1626
1627 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1628 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1629 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1630 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1631 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1632 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1633 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1634 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1635 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1636 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1637 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1638 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1639
1640 // There is no byte sized k-register load or store without AVX512DQ.
1641 if (!Subtarget.hasDQI()) {
1642 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1643 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1644 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1645 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1646
1647 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1648 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1649 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1650 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1651 }
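// For illustration (sketch only): without AVX512DQ there is no KMOVB, so a
// v8i1 (or narrower) mask load/store is custom-lowered, typically by routing
// the bits through a wider 16-bit mask move or a GPR, rather than being
// selected directly.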
1652
1653 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1654 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1655 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1656 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1657 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1658 }
1659
1660 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1661 setOperationAction(ISD::VSELECT, VT, Expand);
1662
1663 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1664 setOperationAction(ISD::SETCC, VT, Custom);
1665 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1666 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1667 setOperationAction(ISD::SELECT, VT, Custom);
1668 setOperationAction(ISD::TRUNCATE, VT, Custom);
1669
1670 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1671 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1672 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1673 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1674 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1675 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1676 }
1677
1678 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1679 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1680 }
1681
1682 // This block controls legalization for 512-bit operations with 32/64 bit
1683 // elements. 512-bits can be disabled based on prefer-vector-width and
1684 // required-vector-width function attributes.
1685 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1686 bool HasBWI = Subtarget.hasBWI();
1687
1688 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1689 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1690 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1691 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1692 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1693 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1694 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1695
1696 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1697 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1698 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1699 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1700 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1701 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1702 if (HasBWI)
1703 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1704 }
1705
1706 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1707 setOperationAction(ISD::FNEG, VT, Custom);
1708 setOperationAction(ISD::FABS, VT, Custom);
1709 setOperationAction(ISD::FMA, VT, Legal);
1710 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1711 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1712 }
1713
1714 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1715 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1716 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1717 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1718 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1719 }
1720
1721 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1722 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1723 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1724 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1725 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1726 }
1727
1728 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1729 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1730 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1731 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1732 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1733 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1734
1735 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1736 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1737 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1738 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1739 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1740 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1741 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1742 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1743 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1744 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1745 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1746
1747 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1748 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1749 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1750 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1751 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1752 if (HasBWI)
1753 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1754
1755 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1756 // to 512-bit rather than use the AVX2 instructions so that we can use
1757 // k-masks.
1758 if (!Subtarget.hasVLX()) {
1759 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1760 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1761 setOperationAction(ISD::MLOAD, VT, Custom);
1762 setOperationAction(ISD::MSTORE, VT, Custom);
1763 }
1764 }
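// For illustration (sketch of the intent only): with AVX512F but no VLX, a
// masked load of <8 x float> is custom-widened here to a 512-bit masked load
// whose upper mask bits are zero, so a k-register predicate can be used
// instead of the AVX2 VMASKMOV form.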
1765
1766 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1767 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1768 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1769 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1770 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1771 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1772 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1773 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1774 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1775 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1776 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1777 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1778 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1779
1780 if (HasBWI) {
1781 // Extends from v64i1 masks to 512-bit vectors.
1782 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1783 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1784 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1785 }
1786
1787 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1788 setOperationAction(ISD::FFLOOR, VT, Legal);
1789 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1790 setOperationAction(ISD::FCEIL, VT, Legal);
1791 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1792 setOperationAction(ISD::FTRUNC, VT, Legal);
1793 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1794 setOperationAction(ISD::FRINT, VT, Legal);
1795 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1796 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1797 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1798 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1799 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1800
1801 setOperationAction(ISD::FROUND, VT, Custom);
1802 }
1803
1804 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1805 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1806 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1807 }
1808
1809 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1810 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1811 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1812 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1813
1814 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1815 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1816 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1817 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1818
1819 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1820 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1821 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1822 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1823 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1824 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1825 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1827
1828 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1829 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1830
1831 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1832
1833 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1834 setOperationAction(ISD::SRL, VT, Custom);
1835 setOperationAction(ISD::SHL, VT, Custom);
1836 setOperationAction(ISD::SRA, VT, Custom);
1837 setOperationAction(ISD::ROTL, VT, Custom);
1838 setOperationAction(ISD::ROTR, VT, Custom);
1839 setOperationAction(ISD::SETCC, VT, Custom);
1840 setOperationAction(ISD::ABDS, VT, Custom);
1841 setOperationAction(ISD::ABDU, VT, Custom);
1842
1843 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1844 // setcc all the way to isel and prefer SETGT in some isel patterns.
1845 setCondCodeAction(ISD::SETLT, VT, Custom);
1846 setCondCodeAction(ISD::SETLE, VT, Custom);
1847 }
1848 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1849 setOperationAction(ISD::SMAX, VT, Legal);
1850 setOperationAction(ISD::UMAX, VT, Legal);
1851 setOperationAction(ISD::SMIN, VT, Legal);
1852 setOperationAction(ISD::UMIN, VT, Legal);
1853 setOperationAction(ISD::ABS, VT, Legal);
1854 setOperationAction(ISD::CTPOP, VT, Custom);
1855 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1856 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1857 }
1858
1859 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1860 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1861 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1862 setOperationAction(ISD::CTLZ, VT, Custom);
1863 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1864 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1865 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1866 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1867 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1868 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1869 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1870 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1871 }
1872
1873 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1874 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1875 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1876 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1877 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1878 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1879
1880 if (Subtarget.hasDQI()) {
1881 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1882 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1883 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1884 setOperationAction(Opc, MVT::v8i64, Custom);
1885 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1886 }
1887
1888 if (Subtarget.hasCDI()) {
1889 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1890 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1891 setOperationAction(ISD::CTLZ, VT, Legal);
1892 }
1893 } // Subtarget.hasCDI()
1894
1895 if (Subtarget.hasVPOPCNTDQ()) {
1896 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1897 setOperationAction(ISD::CTPOP, VT, Legal);
1898 }
1899
1900 // Extract subvector is special because the value type
1901 // (result) is 256-bit but the source is 512-bit wide.
1902 // 128-bit was made Legal under AVX1.
1903 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1904 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1905 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1906
1907 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1908 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1909 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1910 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1911 setOperationAction(ISD::SELECT, VT, Custom);
1912 setOperationAction(ISD::VSELECT, VT, Custom);
1913 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1915 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1916 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1917 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1918 }
1919 setF16Action(MVT::v32f16, Expand);
1920 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1921 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1922 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1923 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1924 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1925 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1926 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1927 }
1928
1929 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1930 setOperationAction(ISD::MLOAD, VT, Legal);
1931 setOperationAction(ISD::MSTORE, VT, Legal);
1932 setOperationAction(ISD::MGATHER, VT, Custom);
1933 setOperationAction(ISD::MSCATTER, VT, Custom);
1934 }
1935 if (HasBWI) {
1936 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1937 setOperationAction(ISD::MLOAD, VT, Legal);
1938 setOperationAction(ISD::MSTORE, VT, Legal);
1939 }
1940 } else {
1941 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1942 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1943 }
1944
1945 if (Subtarget.hasVBMI2()) {
1946 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1947 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1948 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1949 setOperationAction(ISD::FSHL, VT, Custom);
1950 setOperationAction(ISD::FSHR, VT, Custom);
1951 }
1952
1953 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1954 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1955 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1956 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1957 }
1958 }// useAVX512Regs
1959
1960 // This block controls legalization for operations that don't have
1961 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1962 // narrower widths.
1963 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1964 // These operations are handled on non-VLX by artificially widening in
1965 // isel patterns.
1966
1967 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1968 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1969 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1970
1971 if (Subtarget.hasDQI()) {
1972 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1973 // v2f32 UINT_TO_FP is already custom under SSE2.
1974 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1975 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1976 "Unexpected operation action!");
1977 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1978 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1979 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1980 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1981 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1982 }
1983
1984 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1985 setOperationAction(ISD::SMAX, VT, Legal);
1986 setOperationAction(ISD::UMAX, VT, Legal);
1987 setOperationAction(ISD::SMIN, VT, Legal);
1988 setOperationAction(ISD::UMIN, VT, Legal);
1989 setOperationAction(ISD::ABS, VT, Legal);
1990 }
1991
1992 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1993 setOperationAction(ISD::ROTL, VT, Custom);
1994 setOperationAction(ISD::ROTR, VT, Custom);
1995 }
1996
1997 // Custom legalize 2x32 to get a little better code.
1998 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1999 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2000
2001 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2002 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2003 setOperationAction(ISD::MSCATTER, VT, Custom);
2004
2005 if (Subtarget.hasDQI()) {
2006 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2007 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2008 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2009 setOperationAction(Opc, MVT::v2i64, Custom);
2010 setOperationAction(Opc, MVT::v4i64, Custom);
2011 }
2012 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2013 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2014 }
2015
2016 if (Subtarget.hasCDI()) {
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2018 setOperationAction(ISD::CTLZ, VT, Legal);
2019 }
2020 } // Subtarget.hasCDI()
2021
2022 if (Subtarget.hasVPOPCNTDQ()) {
2023 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2024 setOperationAction(ISD::CTPOP, VT, Legal);
2025 }
2026 }
2027
2028 // This block controls legalization of v32i1/v64i1, which are available with
2029 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
2030 // useBWIRegs.
2031 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2032 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2033 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2034
2035 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2036 setOperationAction(ISD::VSELECT, VT, Expand);
2037 setOperationAction(ISD::TRUNCATE, VT, Custom);
2038 setOperationAction(ISD::SETCC, VT, Custom);
2039 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2040 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2041 setOperationAction(ISD::SELECT, VT, Custom);
2042 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2043 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2044 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2045 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2046 }
2047
2048 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2049 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2050
2051 // Extends from v32i1 masks to 256-bit vectors.
2052 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2053 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2054 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2055
2056 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2057 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2058 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2059 }
2060
2061 // These operations are handled on non-VLX by artificially widening in
2062 // isel patterns.
2063 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2064
2065 if (Subtarget.hasBITALG()) {
2066 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2067 setOperationAction(ISD::CTPOP, VT, Legal);
2068 }
2069 }
2070
2071 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2072 auto setGroup = [&] (MVT VT) {
2073 setOperationAction(ISD::FADD, VT, Legal);
2074 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2075 setOperationAction(ISD::FSUB, VT, Legal);
2076 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2077 setOperationAction(ISD::FMUL, VT, Legal);
2078 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2079 setOperationAction(ISD::FDIV, VT, Legal);
2080 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2081 setOperationAction(ISD::FSQRT, VT, Legal);
2082 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2083
2084 setOperationAction(ISD::FFLOOR, VT, Legal);
2085 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2086 setOperationAction(ISD::FCEIL, VT, Legal);
2087 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2088 setOperationAction(ISD::FTRUNC, VT, Legal);
2089 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2090 setOperationAction(ISD::FRINT, VT, Legal);
2091 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2092 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2093 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2094
2095 setOperationAction(ISD::FROUND, VT, Custom);
2096
2097 setOperationAction(ISD::LOAD, VT, Legal);
2098 setOperationAction(ISD::STORE, VT, Legal);
2099
2100 setOperationAction(ISD::FMA, VT, Legal);
2101 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2102 setOperationAction(ISD::VSELECT, VT, Legal);
2103 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2104 setOperationAction(ISD::SELECT, VT, Custom);
2105
2106 setOperationAction(ISD::FNEG, VT, Custom);
2107 setOperationAction(ISD::FABS, VT, Custom);
2108 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2109 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2110 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2111 };
2112
2113 // AVX512_FP16 scalar operations
2114 setGroup(MVT::f16);
2115 setOperationAction(ISD::FREM, MVT::f16, Promote);
2116 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2117 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2118 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2119 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2120 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2121 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2122 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2123 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2124 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2125 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2126 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2127 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2128 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2129
2130 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2131 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2132
2133 if (Subtarget.useAVX512Regs()) {
2134 setGroup(MVT::v32f16);
2135 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2136 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2137 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2138 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2139 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2142 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2143 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2144 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2146 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2147
2148 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2149 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2150 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2151 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2152 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2153 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2154 MVT::v32i16);
2155 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2156 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2157 MVT::v32i16);
2158 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2159 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2160 MVT::v32i16);
2161 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2162 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2163 MVT::v32i16);
2164
2165 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2166 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2167 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2168
2169 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2170 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2171
2172 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2173 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2174 }
2175
2176 if (Subtarget.hasVLX()) {
2177 setGroup(MVT::v8f16);
2178 setGroup(MVT::v16f16);
2179
2180 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2181 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2183 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2184 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2185 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2187 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2188 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2189 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2190
2191 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2192 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2193 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2194 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2195 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2196 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2197 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2198 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2199 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2200 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2201
2202 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2203 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2204 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2205
2206 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2207 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2209
2210 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2211 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2212 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2214
2215 // Need to custom widen these to prevent scalarization.
2216 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2217 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2218 }
2219 }
2220
2221 if (!Subtarget.useSoftFloat() &&
2222 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2223 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2224 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2225 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2226 // provide the method to promote BUILD_VECTOR. Set the operation action
2227 // Custom to do the customization later.
2228 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2229 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2230 setF16Action(VT, Expand);
2231 setOperationAction(ISD::FADD, VT, Expand);
2232 setOperationAction(ISD::FSUB, VT, Expand);
2233 setOperationAction(ISD::FMUL, VT, Expand);
2234 setOperationAction(ISD::FDIV, VT, Expand);
2235 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2236 }
2237 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2238 }
2239
2240 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2241 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2242 setF16Action(MVT::v32bf16, Expand);
2243 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2244 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2245 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2246 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2247 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2248 }
2249
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2251 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2252 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2253 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2254 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2255 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2256
2257 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2258 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2259 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2261 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2262
2263 if (Subtarget.hasBWI()) {
2264 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2265 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2266 }
2267
2268 if (Subtarget.hasFP16()) {
2269 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2270 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2271 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2272 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2273 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2274 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2275 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2276 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2277 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2278 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2279 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2280 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2281 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2282 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2283 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2284 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2285 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2286 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2287 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2288 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2289 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2290 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2291 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2292 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2293 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2295 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2296 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2297 }
2298
2299 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2300 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2301 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2302 }
2303
2304 if (Subtarget.hasAMXTILE()) {
2305 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2306 }
2307
2308 // We want to custom lower some of our intrinsics.
2309 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2310 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2311 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2312 if (!Subtarget.is64Bit()) {
2313 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2314 }
2315
2316 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2317 // handle type legalization for these operations here.
2318 //
2319 // FIXME: We really should do custom legalization for addition and
2320 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2321 // than generic legalization for 64-bit multiplication-with-overflow, though.
2322 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2323 if (VT == MVT::i64 && !Subtarget.is64Bit())
2324 continue;
2325 // Add/Sub/Mul with overflow operations are custom lowered.
2326 setOperationAction(ISD::SADDO, VT, Custom);
2327 setOperationAction(ISD::UADDO, VT, Custom);
2328 setOperationAction(ISD::SSUBO, VT, Custom);
2329 setOperationAction(ISD::USUBO, VT, Custom);
2330 setOperationAction(ISD::SMULO, VT, Custom);
2331 setOperationAction(ISD::UMULO, VT, Custom);
2332
2333 // Support carry in as value rather than glue.
2334 setOperationAction(ISD::ADDCARRY, VT, Custom);
2335 setOperationAction(ISD::SUBCARRY, VT, Custom);
2336 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2337 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2338 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2339 }
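// For illustration (rough sketch only): an intrinsic such as
//   %s = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// becomes ISD::SADDO, which the Custom hook lowers to an ADD that also
// defines EFLAGS plus a flag read (SETO-style); ADDCARRY/SUBCARRY keep the
// carry as a value so ADC/SBB chains can be formed.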
2340
2341 if (!Subtarget.is64Bit()) {
2342 // These libcalls are not available in 32-bit.
2343 setLibcallName(RTLIB::SHL_I128, nullptr);
2344 setLibcallName(RTLIB::SRL_I128, nullptr);
2345 setLibcallName(RTLIB::SRA_I128, nullptr);
2346 setLibcallName(RTLIB::MUL_I128, nullptr);
2347 // The MULO libcall is not part of libgcc, only compiler-rt.
2348 setLibcallName(RTLIB::MULO_I64, nullptr);
2349 }
2350 // The MULO libcall is not part of libgcc, only compiler-rt.
2351 setLibcallName(RTLIB::MULO_I128, nullptr);
2352
2353 // Combine sin / cos into _sincos_stret if it is available.
2354 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2355 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2356 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2357 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2358 }
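// For illustration (sketch only): on targets that provide the
// RTLIB::SINCOS_STRET_* libcalls checked above, separate calls to sinf(%x)
// and cosf(%x) on the same argument can be merged by the Custom FSINCOS
// lowering into one sincos_stret-style call that returns both results.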
2359
2360 if (Subtarget.isTargetWin64()) {
2361 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2362 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2363 setOperationAction(ISD::SREM, MVT::i128, Custom);
2364 setOperationAction(ISD::UREM, MVT::i128, Custom);
2365 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2366 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2367 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2368 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2369 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2370 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2371 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2372 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2373 }
2374
2375 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2376 // is. We should promote the value to 64-bits to solve this.
2377 // This is what the CRT headers do - `fmodf` is an inline header
2378 // function casting to f64 and calling `fmod`.
2379 if (Subtarget.is32Bit() &&
2380 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2381 for (ISD::NodeType Op :
2382 {ISD::FCEIL, ISD::STRICT_FCEIL,
2383 ISD::FCOS, ISD::STRICT_FCOS,
2384 ISD::FEXP, ISD::STRICT_FEXP,
2385 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2386 ISD::FREM, ISD::STRICT_FREM,
2387 ISD::FLOG, ISD::STRICT_FLOG,
2388 ISD::FLOG10, ISD::STRICT_FLOG10,
2389 ISD::FPOW, ISD::STRICT_FPOW,
2390 ISD::FSIN, ISD::STRICT_FSIN})
2391 if (isOperationExpand(Op, MVT::f32))
2392 setOperationAction(Op, MVT::f32, Promote);
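// Illustrative sketch (an assumption about the CRT header, not a verbatim
// copy): on 32-bit MSVC the f32 math functions are inline wrappers that widen
// to f64, roughly
//   inline float fmodf(float X, float Y) {
//     return static_cast<float>(fmod(static_cast<double>(X),
//                                    static_cast<double>(Y)));
//   }
// which is why the f32 operations above are promoted to f64 when no native
// f32 lowering is available.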
2393
2394 // We have target-specific dag combine patterns for the following nodes:
2395 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2396 ISD::SCALAR_TO_VECTOR,
2397 ISD::INSERT_VECTOR_ELT,
2398 ISD::EXTRACT_VECTOR_ELT,
2399 ISD::CONCAT_VECTORS,
2400 ISD::INSERT_SUBVECTOR,
2401 ISD::EXTRACT_SUBVECTOR,
2402 ISD::BITCAST,
2403 ISD::VSELECT,
2404 ISD::SELECT,
2405 ISD::SHL,
2406 ISD::SRA,
2407 ISD::SRL,
2408 ISD::OR,
2409 ISD::AND,
2410 ISD::ADD,
2411 ISD::FADD,
2412 ISD::FSUB,
2413 ISD::FNEG,
2414 ISD::FMA,
2415 ISD::STRICT_FMA,
2416 ISD::FMINNUM,
2417 ISD::FMAXNUM,
2418 ISD::SUB,
2419 ISD::LOAD,
2420 ISD::MLOAD,
2421 ISD::STORE,
2422 ISD::MSTORE,
2423 ISD::TRUNCATE,
2424 ISD::ZERO_EXTEND,
2425 ISD::ANY_EXTEND,
2426 ISD::SIGN_EXTEND,
2427 ISD::SIGN_EXTEND_INREG,
2428 ISD::ANY_EXTEND_VECTOR_INREG,
2429 ISD::SIGN_EXTEND_VECTOR_INREG,
2430 ISD::ZERO_EXTEND_VECTOR_INREG,
2431 ISD::SINT_TO_FP,
2432 ISD::UINT_TO_FP,
2433 ISD::STRICT_SINT_TO_FP,
2434 ISD::STRICT_UINT_TO_FP,
2435 ISD::SETCC,
2436 ISD::MUL,
2437 ISD::XOR,
2438 ISD::MSCATTER,
2439 ISD::MGATHER,
2440 ISD::FP16_TO_FP,
2441 ISD::FP_EXTEND,
2442 ISD::STRICT_FP_EXTEND,
2443 ISD::FP_ROUND,
2444 ISD::STRICT_FP_ROUND});
2445
2446 computeRegisterProperties(Subtarget.getRegisterInfo());
2447
2448 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2449 MaxStoresPerMemsetOptSize = 8;
2450 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2451 MaxStoresPerMemcpyOptSize = 4;
2452 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2453 MaxStoresPerMemmoveOptSize = 4;
2454
2455 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2456 // that needs to be benchmarked and balanced with the potential use of vector
2457 // load/store types (PR33329, PR33914).
2458 MaxLoadsPerMemcmp = 2;
2459 MaxLoadsPerMemcmpOptSize = 2;
2460
2461 // Default loop alignment, which can be overridden by -align-loops.
2462 setPrefLoopAlignment(Align(16));
2463
2464 // An out-of-order CPU can speculatively execute past a predictable branch,
2465 // but a conditional move could be stalled by an expensive earlier operation.
2466 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2467 EnableExtLdPromotion = true;
2468 setPrefFunctionAlignment(Align(16));
2469
2470 verifyIntrinsicTables();
2471
2472 // Default to having -disable-strictnode-mutation on
2473 IsStrictFPEnabled = true;
2474}
2475
2476// This has so far only been implemented for 64-bit MachO.
2477bool X86TargetLowering::useLoadStackGuardNode() const {
2478 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2479}
2480
2481bool X86TargetLowering::useStackGuardXorFP() const {
2482 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2483 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2484}
2485
2486SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2487 const SDLoc &DL) const {
2488 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2489 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2490 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2491 return SDValue(Node, 0);
2492}
2493
2494TargetLoweringBase::LegalizeTypeAction
2495X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2496 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2497 !Subtarget.hasBWI())
2498 return TypeSplitVector;
2499
2500 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2501 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2502 return TypeSplitVector;
2503
2504 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2505 VT.getVectorElementType() != MVT::i1)
2506 return TypeWidenVector;
2507
2508 return TargetLoweringBase::getPreferredVectorAction(VT);
2509}
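// Worked examples of the preferences above (illustrative, derived from the
// checks in this function):
//   v32i1 with AVX512F but no BWI -> split (no 32-bit mask registers)
//   v2f16 without F16C            -> split (no packed f16 support)
//   v3f32 or v2i32                -> widen to the next legal vector
//   other vXi1 types              -> generic TargetLoweringBase policy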
2510
2511static std::pair<MVT, unsigned>
2512handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2513 const X86Subtarget &Subtarget) {
2514 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2515 // convention is one that uses k registers.
2516 if (NumElts == 2)
2517 return {MVT::v2i64, 1};
2518 if (NumElts == 4)
2519 return {MVT::v4i32, 1};
2520 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2521 CC != CallingConv::Intel_OCL_BI)
2522 return {MVT::v8i16, 1};
2523 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2524 CC != CallingConv::Intel_OCL_BI)
2525 return {MVT::v16i8, 1};
2526 // v32i1 passes in ymm unless we have BWI and the calling convention is
2527 // regcall.
2528 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2529 return {MVT::v32i8, 1};
2530 // Split v64i1 vectors if we don't have v64i8 available.
2531 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2532 if (Subtarget.useAVX512Regs())
2533 return {MVT::v64i8, 1};
2534 return {MVT::v32i8, 2};
2535 }
2536
2537 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2538 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2539 NumElts > 64)
2540 return {MVT::i8, NumElts};
2541
2542 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2543}
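// Worked examples of the mapping above, assuming the default C calling
// convention (illustrative only):
//   v2i1  -> { v2i64, 1 }     v4i1  -> { v4i32, 1 }
//   v8i1  -> { v8i16, 1 }     v16i1 -> { v16i8, 1 }
//   v32i1 -> { v32i8, 1 }
//   v64i1 with BWI -> { v64i8, 1 } (512-bit regs) or { v32i8, 2 } (256-bit regs)
//   v5i1, v128i1, or v64i1 without BWI -> { i8, NumElts } (scalarized)
//   anything else -> { INVALID_SIMPLE_VALUE_TYPE, 0 }, i.e. generic handling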
2544
2545MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2546 CallingConv::ID CC,
2547 EVT VT) const {
2548 if (VT.isVector()) {
2549 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2550 unsigned NumElts = VT.getVectorNumElements();
2551
2552 MVT RegisterVT;
2553 unsigned NumRegisters;
2554 std::tie(RegisterVT, NumRegisters) =
2555 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2556 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2557 return RegisterVT;
2558 }
2559
2560 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2561 return MVT::v8f16;
2562 }
2563
2564 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2565 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2566 !Subtarget.hasX87())
2567 return MVT::i32;
2568
2569 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2570 return getRegisterTypeForCallingConv(Context, CC,
2571 VT.changeVectorElementTypeToInteger());
2572
2573 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2574}
2575
2576unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2577 CallingConv::ID CC,
2578 EVT VT) const {
2579 if (VT.isVector()) {
2580 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2581 unsigned NumElts = VT.getVectorNumElements();
2582
2583 MVT RegisterVT;
2584 unsigned NumRegisters;
2585 std::tie(RegisterVT, NumRegisters) =
2586 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2587 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2588 return NumRegisters;
2589 }
2590
2591 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2592 return 1;
2593 }
2594
2595 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2596 // x87 is disabled.
2597 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2598 if (VT == MVT::f64)
2599 return 2;
2600 if (VT == MVT::f80)
2601 return 3;
2602 }
2603
2604 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2605 return getNumRegistersForCallingConv(Context, CC,
2606 VT.changeVectorElementTypeToInteger());
2607
2608 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2609}
2610
2611unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2612 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2613 unsigned &NumIntermediates, MVT &RegisterVT) const {
2614 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2615 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2616 Subtarget.hasAVX512() &&
2617 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2618 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2619 VT.getVectorNumElements() > 64)) {
2620 RegisterVT = MVT::i8;
2621 IntermediateVT = MVT::i1;
2622 NumIntermediates = VT.getVectorNumElements();
2623 return NumIntermediates;
2624 }
2625
2626 // Split v64i1 vectors if we don't have v64i8 available.
2627 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2628 CC != CallingConv::X86_RegCall) {
2629 RegisterVT = MVT::v32i8;
2630 IntermediateVT = MVT::v32i1;
2631 NumIntermediates = 2;
2632 return 2;
2633 }
2634
2635 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2636 NumIntermediates, RegisterVT);
2637}
2638
2639EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2640 LLVMContext& Context,
2641 EVT VT) const {
2642 if (!VT.isVector())
2643 return MVT::i8;
2644
2645 if (Subtarget.hasAVX512()) {
2646 // Figure out what this type will be legalized to.
2647 EVT LegalVT = VT;
2648 while (getTypeAction(Context, LegalVT) != TypeLegal)
2649 LegalVT = getTypeToTransformTo(Context, LegalVT);
2650
2651 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2652 if (LegalVT.getSimpleVT().is512BitVector())
2653 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2654
2655 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2656 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2657 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2658 // vXi16/vXi8.
2659 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2660 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2661 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2662 }
2663 }
2664
2665 return VT.changeVectorElementTypeToInteger();
2666}
2667
2668/// Helper for getByValTypeAlignment to determine
2669/// the desired ByVal argument alignment.
2670static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2671 if (MaxAlign == 16)
2672 return;
2673 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2674 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2675 MaxAlign = Align(16);
2676 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2677 Align EltAlign;
2678 getMaxByValAlign(ATy->getElementType(), EltAlign);
2679 if (EltAlign > MaxAlign)
2680 MaxAlign = EltAlign;
2681 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2682 for (auto *EltTy : STy->elements()) {
2683 Align EltAlign;
2684 getMaxByValAlign(EltTy, EltAlign);
2685 if (EltAlign > MaxAlign)
2686 MaxAlign = EltAlign;
2687 if (MaxAlign == 16)
2688 break;
2689 }
2690 }
2691}
2692
2693/// Return the desired alignment for ByVal aggregate
2694/// function arguments in the caller parameter area. For X86, aggregates
2695/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2696/// are at 4-byte boundaries.
2697uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2698 const DataLayout &DL) const {
2699 if (Subtarget.is64Bit()) {
2700 // Max of 8 and alignment of type.
2701 Align TyAlign = DL.getABITypeAlign(Ty);
2702 if (TyAlign > 8)
2703 return TyAlign.value();
2704 return 8;
2705 }
2706
2707 Align Alignment(4);
2708 if (Subtarget.hasSSE1())
2709 getMaxByValAlign(Ty, Alignment);
2710 return Alignment.value();
2711}
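// Illustrative summary of the policy above (derived from this function, not
// additional rules):
//   x86-64:            max(8, ABI alignment of the type)
//   x86-32 with SSE:   16 if the aggregate (transitively) contains a 128-bit
//                      vector, otherwise 4
//   x86-32 without SSE: always 4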
2712
2713/// It returns EVT::Other if the type should be determined using generic
2714/// target-independent logic.
2715/// For vector ops we check that the overall size isn't larger than our
2716/// preferred vector width.
2717EVT X86TargetLowering::getOptimalMemOpType(
2718 const MemOp &Op, const AttributeList &FuncAttributes) const {
2719 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2720 if (Op.size() >= 16 &&
2721 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2722 // FIXME: Check if unaligned 64-byte accesses are slow.
2723 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2724 (Subtarget.getPreferVectorWidth() >= 512)) {
2725 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2726 }
2727 // FIXME: Check if unaligned 32-byte accesses are slow.
2728 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2729 Subtarget.useLight256BitInstructions()) {
2730 // Although this isn't a well-supported type for AVX1, we'll let
2731 // legalization and shuffle lowering produce the optimal codegen. If we
2732 // choose an optimal type with a vector element larger than a byte,
2733 // getMemsetStores() may create an intermediate splat (using an integer
2734 // multiply) before we splat as a vector.
2735 return MVT::v32i8;
2736 }
2737 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2738 return MVT::v16i8;
2739 // TODO: Can SSE1 handle a byte vector?
2740 // If we have SSE1 registers we should be able to use them.
2741 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2742 (Subtarget.getPreferVectorWidth() >= 128))
2743 return MVT::v4f32;
2744 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2745 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2746 // Do not use f64 to lower memcpy if source is string constant. It's
2747 // better to use i32 to avoid the loads.
2748 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2749 // The gymnastics of splatting a byte value into an XMM register and then
2750 // only using 8-byte stores (because this is a CPU with slow unaligned
2751 // 16-byte accesses) makes that a loser.
2752 return MVT::f64;
2753 }
2754 }
2755 // This is a compromise. If we reach here, unaligned accesses may be slow on
2756 // this target. However, creating smaller, aligned accesses could be even
2757 // slower and would certainly be a lot more code.
2758 if (Subtarget.is64Bit() && Op.size() >= 8)
2759 return MVT::i64;
2760 return MVT::i32;
2761}
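// Rough decision table for the selection above (illustrative; the alignment
// and subtarget guards in the code still apply):
//   >=64 bytes, AVX-512, 512-bit width preferred -> v64i8 (BWI) or v16i32
//   >=32 bytes, AVX, cheap 256-bit ops           -> v32i8
//   >=16 bytes, SSE2                             -> v16i8
//   >=16 bytes, SSE1 only                        -> v4f32
//   >=8 bytes, 32-bit, SSE2, slow unaligned 16B  -> f64 (zero memset / plain memcpy)
//   otherwise                                    -> i64 on 64-bit targets, else i32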
2762
2763bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2764 if (VT == MVT::f32)
2765 return Subtarget.hasSSE1();
2766 if (VT == MVT::f64)
2767 return Subtarget.hasSSE2();
2768 return true;
2769}
2770
2771static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2772 return (8 * Alignment.value()) % SizeInBits == 0;
2773}
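// The check above treats an access as "bit aligned" when the byte alignment,
// expressed in bits, is a multiple of the access size. For example
// (illustrative):
//   isBitAligned(Align(16), 128) -> (8 * 16) % 128 == 0 -> true
//   isBitAligned(Align(32), 128) -> (8 * 32) % 128 == 0 -> true
//   isBitAligned(Align(4),  128) -> (8 * 4)  % 128 != 0 -> false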
2774
2775bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2776 if (isBitAligned(Alignment, VT.getSizeInBits()))
2777 return true;
2778 switch (VT.getSizeInBits()) {
2779 default:
2780 // 8-byte and under are always assumed to be fast.
2781 return true;
2782 case 128:
2783 return !Subtarget.isUnalignedMem16Slow();
2784 case 256:
2785 return !Subtarget.isUnalignedMem32Slow();
2786 // TODO: What about AVX-512 (512-bit) accesses?
2787 }
2788}
2789
2790bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2791 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2792 unsigned *Fast) const {
2793 if (Fast)
2794 *Fast = isMemoryAccessFast(VT, Alignment);
2795 // NonTemporal vector memory ops must be aligned.
2796 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2797 // NT loads can only be vector aligned, so if it's less aligned than the
2798 // minimum vector size (which we can split the vector down to), we might as
2799 // well use a regular unaligned vector load.
2800 // We don't have any NT loads pre-SSE41.
2801 if (!!(Flags & MachineMemOperand::MOLoad))
2802 return (Alignment < 16 || !Subtarget.hasSSE41());
2803 return false;
2804 }
2805 // Misaligned accesses of any size are always allowed.
2806 return true;
2807}
2808
2809bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2810 const DataLayout &DL, EVT VT,
2811 unsigned AddrSpace, Align Alignment,
2812 MachineMemOperand::Flags Flags,
2813 unsigned *Fast) const {
2814 if (Fast)
2815 *Fast = isMemoryAccessFast(VT, Alignment);
2816 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2817 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2818 /*Fast=*/nullptr))
2819 return true;
2820 // NonTemporal vector memory ops are special, and must be aligned.
2821 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2822 return false;
2823 switch (VT.getSizeInBits()) {
2824 case 128:
2825 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2826 return true;
2827 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2828 return true;
2829 return false;
2830 case 256:
2831 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2832 return true;
2833 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2834 return true;
2835 return false;
2836 case 512:
2837 if (Subtarget.hasAVX512())
2838 return true;
2839 return false;
2840 default:
2841 return false; // Don't have NonTemporal vector memory ops of this size.
2842 }
2843 }
2844 return true;
2845}
2846
2847/// Return the entry encoding for a jump table in the
2848/// current function. The returned value is a member of the
2849/// MachineJumpTableInfo::JTEntryKind enum.
2850unsigned X86TargetLowering::getJumpTableEncoding() const {
2851 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2852 // symbol.
2853 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2854 return MachineJumpTableInfo::EK_Custom32;
2855
2856 // Otherwise, use the normal jump table encoding heuristics.
2857 return TargetLowering::getJumpTableEncoding();
2858}
2859
2860bool X86TargetLowering::splitValueIntoRegisterParts(
2861 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2862 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2863 bool IsABIRegCopy = CC.has_value();
2864 EVT ValueVT = Val.getValueType();
2865 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2866 unsigned ValueBits = ValueVT.getSizeInBits();
2867 unsigned PartBits = PartVT.getSizeInBits();
2868 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2869 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2870 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2871 Parts[0] = Val;
2872 return true;
2873 }
2874 return false;
2875}
2876
2877SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2878 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2879 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2882 unsigned ValueBits = ValueVT.getSizeInBits();
2883 unsigned PartBits = PartVT.getSizeInBits();
2884 SDValue Val = Parts[0];
2885
2886 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2888 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2889 return Val;
2890 }
2891 return SDValue();
2892}
2893
2894bool X86TargetLowering::useSoftFloat() const {
2895 return Subtarget.useSoftFloat();
2896}
2897
2898void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2899 ArgListTy &Args) const {
2900
2901 // Only relabel X86-32 for C / Stdcall CCs.
2902 if (Subtarget.is64Bit())
2903 return;
2904 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2905 return;
2906 unsigned ParamRegs = 0;
2907 if (auto *M = MF->getFunction().getParent())
2908 ParamRegs = M->getNumberRegisterParameters();
2909
2910 // Mark the first N integer arguments as being passed in registers.
2911 for (auto &Arg : Args) {
2912 Type *T = Arg.Ty;
2913 if (T->isIntOrPtrTy())
2914 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2915 unsigned numRegs = 1;
2916 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2917 numRegs = 2;
2918 if (ParamRegs < numRegs)
2919 return;
2920 ParamRegs -= numRegs;
2921 Arg.IsInReg = true;
2922 }
2923 }
2924}
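// Illustrative example of the register-parameter accounting above: with
// -mregparm=3 (ParamRegs == 3), a 32-bit C libcall taking (i32, i64, i32)
// marks the first i32 inreg (1 register used, 2 left), the i64 inreg (2
// registers used, 0 left), and then stops, so the trailing i32 stays on the
// stack.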
2925
2926const MCExpr *
2927X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2928 const MachineBasicBlock *MBB,
2929 unsigned uid,MCContext &Ctx) const{
2930 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2931 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2932 // entries.
2933 return MCSymbolRefExpr::create(MBB->getSymbol(),
2934 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2935}
2936
2937/// Returns relocation base for the given PIC jumptable.
2938SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2939 SelectionDAG &DAG) const {
2940 if (!Subtarget.is64Bit())
2941 // This doesn't have SDLoc associated with it, but is not really the
2942 // same as a Register.
2943 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2944 getPointerTy(DAG.getDataLayout()));
2945 return Table;
2946}
2947
2948/// This returns the relocation base for the given PIC jumptable,
2949/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2950const MCExpr *X86TargetLowering::
2951getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2952 MCContext &Ctx) const {
2953 // X86-64 uses RIP relative addressing based on the jump table label.
2954 if (Subtarget.isPICStyleRIPRel())
2955 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2956
2957 // Otherwise, the reference is relative to the PIC base.
2958 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2959}
2960
2961std::pair<const TargetRegisterClass *, uint8_t>
2962X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2963 MVT VT) const {
2964 const TargetRegisterClass *RRC = nullptr;
2965 uint8_t Cost = 1;
2966 switch (VT.SimpleTy) {
2967 default:
2968 return TargetLowering::findRepresentativeClass(TRI, VT);
2969 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2970 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2971 break;
2972 case MVT::x86mmx:
2973 RRC = &X86::VR64RegClass;
2974 break;
2975 case MVT::f32: case MVT::f64:
2976 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2977 case MVT::v4f32: case MVT::v2f64:
2978 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2979 case MVT::v8f32: case MVT::v4f64:
2980 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2981 case MVT::v16f32: case MVT::v8f64:
2982 RRC = &X86::VR128XRegClass;
2983 break;
2984 }
2985 return std::make_pair(RRC, Cost);
2986}
2987
2988unsigned X86TargetLowering::getAddressSpace() const {
2989 if (Subtarget.is64Bit())
2990 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2991 return 256;
2992}
2993
2994static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2995 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2996 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2997}
2998
2999static Constant* SegmentOffset(IRBuilderBase &IRB,
3000 int Offset, unsigned AddressSpace) {
3001 return ConstantExpr::getIntToPtr(
3002 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3003 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3004}
3005
3006Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3007 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3008 // tcbhead_t; use it instead of the usual global variable (see
3009 // sysdeps/{i386,x86_64}/nptl/tls.h)
3010 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3011 if (Subtarget.isTargetFuchsia()) {
3012 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3013 return SegmentOffset(IRB, 0x10, getAddressSpace());
3014 } else {
3015 unsigned AddressSpace = getAddressSpace();
3016 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3017 // In particular, some users may customize the base reg and offset.
3018 int Offset = M->getStackProtectorGuardOffset();
3019 // If we don't set -stack-protector-guard-offset value:
3020 // %fs:0x28, unless we're using a Kernel code model, in which case
3021 // it's %gs:0x28. gs:0x14 on i386.
3022 if (Offset == INT_MAX)
3023 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3024
3025 StringRef GuardReg = M->getStackProtectorGuardReg();
3026 if (GuardReg == "fs")
3027 AddressSpace = X86AS::FS;
3028 else if (GuardReg == "gs")
3029 AddressSpace = X86AS::GS;
3030
3031 // Use the symbol guard if the user specifies one.
3032 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3033 if (!GuardSymb.empty()) {
3034 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3035 if (!GV) {
3036 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3037 : Type::getInt32Ty(M->getContext());
3038 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3039 nullptr, GuardSymb, nullptr,
3040 GlobalValue::NotThreadLocal, AddressSpace);
3041 }
3042 return GV;
3043 }
3044
3045 return SegmentOffset(IRB, Offset, AddressSpace);
3046 }
3047 }
3048 return TargetLowering::getIRStackGuard(IRB);
3049}
3050
3051void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3052 // MSVC CRT provides functionalities for stack protection.
3053 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3054 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3055 // MSVC CRT has a global variable holding security cookie.
3056 M.getOrInsertGlobal("__security_cookie",
3057 Type::getInt8PtrTy(M.getContext()));
3058
3059 // MSVC CRT has a function to validate security cookie.
3060 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3061 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3062 Type::getInt8PtrTy(M.getContext()));
3063 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3064 F->setCallingConv(CallingConv::X86_FastCall);
3065 F->addParamAttr(0, Attribute::AttrKind::InReg);
3066 }
3067 return;
3068 }
3069
3070 StringRef GuardMode = M.getStackProtectorGuard();
3071
3072 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3073 if ((GuardMode == "tls" || GuardMode.empty()) &&
3074 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3075 return;
3076 TargetLowering::insertSSPDeclarations(M);
3077}
3078
3079Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3080 // MSVC CRT has a global variable holding security cookie.
3081 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3082 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3083 return M.getGlobalVariable("__security_cookie");
3084 }
3085 return TargetLowering::getSDagStackGuard(M);
3086}
3087
3088Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3089 // MSVC CRT has a function to validate security cookie.
3090 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3091 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3092 return M.getFunction("__security_check_cookie");
3093 }
3094 return TargetLowering::getSSPStackGuardCheck(M);
3095}
3096
3097Value *
3098X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3099 if (Subtarget.getTargetTriple().isOSContiki())
3100 return getDefaultSafeStackPointerLocation(IRB, false);
3101
3102 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3103 // definition of TLS_SLOT_SAFESTACK in
3104 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3105 if (Subtarget.isTargetAndroid()) {
3106 // %fs:0x48, unless we're using a Kernel code model, in which case it's
3107 // %gs:0x48. %gs:0x24 on i386.
3108 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3109 return SegmentOffset(IRB, Offset, getAddressSpace());
3110 }
3111
3112 // Fuchsia is similar.
3113 if (Subtarget.isTargetFuchsia()) {
3114 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3115 return SegmentOffset(IRB, 0x18, getAddressSpace());
3116 }
3117
3118 return TargetLowering::getSafeStackPointerLocation(IRB);
3119}
3120
3121//===----------------------------------------------------------------------===//
3122// Return Value Calling Convention Implementation
3123//===----------------------------------------------------------------------===//
3124
3125bool X86TargetLowering::CanLowerReturn(
3126 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3127 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3128 SmallVector<CCValAssign, 16> RVLocs;
3129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3130 return CCInfo.CheckReturn(Outs, RetCC_X86);
3131}
3132
3133const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3134 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3135 return ScratchRegs;
3136}
3137
3138ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3139 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3140 // tests at the moment, which is not what we expected.
3141 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3142 return RCRegs;
3143}
3144
3145 /// Lowers mask values (v*i1) to the local register values
3146/// \returns DAG node after lowering to register type
3147static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3148 const SDLoc &Dl, SelectionDAG &DAG) {
3149 EVT ValVT = ValArg.getValueType();
3150
3151 if (ValVT == MVT::v1i1)
3152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3153 DAG.getIntPtrConstant(0, Dl));
3154
3155 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3156 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3157 // Two stage lowering might be required
3158 // bitcast: v8i1 -> i8 / v16i1 -> i16
3159 // anyextend: i8 -> i32 / i16 -> i32
3160 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3161 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3162 if (ValLoc == MVT::i32)
3163 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3164 return ValToCopy;
3165 }
3166
3167 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3168 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3169 // One stage lowering is required
3170 // bitcast: v32i1 -> i32 / v64i1 -> i64
3171 return DAG.getBitcast(ValLoc, ValArg);
3172 }
3173
3174 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3175}
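// Illustrative examples of the lowering above:
//   v8i1  returned in i8  -> single bitcast
//   v8i1  returned in i32 -> bitcast to i8, then any_extend to i32
//   v16i1 returned in i32 -> bitcast to i16, then any_extend to i32
//   v32i1 in i32 / v64i1 in i64 -> single bitcast
//   v1i1  -> extract_vector_elt of element 0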
3176
3177/// Breaks v64i1 value into two registers and adds the new node to the DAG
3178static void Passv64i1ArgInRegs(
3179 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3180 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3181 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3182 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3183 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3184 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3185 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3186 "The value should reside in two registers");
3187
3188 // Before splitting the value we cast it to i64
3189 Arg = DAG.getBitcast(MVT::i64, Arg);
3190
3191 // Splitting the value into two i32 types
3192 SDValue Lo, Hi;
3193 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
3194 DAG.getConstant(0, Dl, MVT::i32));
3195 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
3196 DAG.getConstant(1, Dl, MVT::i32));
3197
3198 // Attach the two i32 types into corresponding registers
3199 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3200 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3201}
3202
3203SDValue
3204X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3205 bool isVarArg,
3206 const SmallVectorImpl<ISD::OutputArg> &Outs,
3207 const SmallVectorImpl<SDValue> &OutVals,
3208 const SDLoc &dl, SelectionDAG &DAG) const {
3209 MachineFunction &MF = DAG.getMachineFunction();
3210 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3211
3212 // In some cases we need to disable registers from the default CSR list.
3213 // For example, when they are used as return registers (preserve_* and X86's
3214 // regcall) or for argument passing (X86's regcall).
3215 bool ShouldDisableCalleeSavedRegister =
3216 shouldDisableRetRegFromCSR(CallConv) ||
3217 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3218
3219 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3220 report_fatal_error("X86 interrupts may not return any value");
3221
3222 SmallVector<CCValAssign, 16> RVLocs;
3223 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3224 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3225
3226 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3227 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3228 ++I, ++OutsIndex) {
3229 CCValAssign &VA = RVLocs[I];
3230 assert(VA.isRegLoc() && "Can only return in registers!");
3231
3232 // Add the register to the CalleeSaveDisableRegs list.
3233 if (ShouldDisableCalleeSavedRegister)
3234 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3235
3236 SDValue ValToCopy = OutVals[OutsIndex];
3237 EVT ValVT = ValToCopy.getValueType();
3238
3239 // Promote values to the appropriate types.
3240 if (VA.getLocInfo() == CCValAssign::SExt)
3241 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3242 else if (VA.getLocInfo() == CCValAssign::ZExt)
3243 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3244 else if (VA.getLocInfo() == CCValAssign::AExt) {
3245 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3246 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3247 else
3248 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3249 }
3250 else if (VA.getLocInfo() == CCValAssign::BCvt)
3251 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3252
3253 assert(VA.getLocInfo() != CCValAssign::FPExt &&
3254 "Unexpected FP-extend for return value.");
3255
3256 // Report an error if we have attempted to return a value via an XMM
3257 // register and SSE was disabled.
3258 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3259 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3260 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3261 } else if (!Subtarget.hasSSE2() &&
3262 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3263 ValVT == MVT::f64) {
3264 // When returning a double via an XMM register, report an error if SSE2 is
3265 // not enabled.
3266 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3267 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3268 }
3269
3270 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3271 // the RET instruction and handled by the FP Stackifier.
3272 if (VA.getLocReg() == X86::FP0 ||
3273 VA.getLocReg() == X86::FP1) {
3274 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3275 // change the value to the FP stack register class.
3276 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3277 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3278 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3279 // Don't emit a copytoreg.
3280 continue;
3281 }
3282
3283 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3284 // which is returned in RAX / RDX.
3285 if (Subtarget.is64Bit()) {
3286 if (ValVT == MVT::x86mmx) {
3287 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3288 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3289 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3290 ValToCopy);
3291 // If we don't have SSE2 available, convert to v4f32 so the generated
3292 // register is legal.
3293 if (!Subtarget.hasSSE2())
3294 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3295 }
3296 }
3297 }
3298
3299 if (VA.needsCustom()) {
3300 assert(VA.getValVT() == MVT::v64i1 &&
3301 "Currently the only custom case is when we split v64i1 to 2 regs");
3302
3303 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3304 Subtarget);
3305
3306 // Add the second register to the CalleeSaveDisableRegs list.
3307 if (ShouldDisableCalleeSavedRegister)
3308 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3309 } else {
3310 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3311 }
3312 }
3313
3314 SDValue Flag;
3315 SmallVector<SDValue, 6> RetOps;
3316 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3317 // Operand #1 = Bytes To Pop
3318 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3319 MVT::i32));
3320
3321 // Copy the result values into the output registers.
3322 for (auto &RetVal : RetVals) {
3323 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3324 RetOps.push_back(RetVal.second);
3325 continue; // Don't emit a copytoreg.
3326 }
3327
3328 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
3329 Flag = Chain.getValue(1);
3330 RetOps.push_back(
3331 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3332 }
3333
3334 // Swift calling convention does not require we copy the sret argument
3335 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3336
3337 // All x86 ABIs require that for returning structs by value we copy
3338 // the sret argument into %rax/%eax (depending on ABI) for the return.
3339 // We saved the argument into a virtual register in the entry block,
3340 // so now we copy the value out and into %rax/%eax.
3341 //
3342 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3343 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3344 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3345 // either case FuncInfo->setSRetReturnReg() will have been called.
3346 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3347 // When we have both sret and another return value, we should use the
3348 // original Chain stored in RetOps[0], instead of the current Chain updated
3349 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3350
3351 // For the case of sret and another return value, we have
3352 // Chain_0 at the function entry
3353 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3354 // If we use Chain_1 in getCopyFromReg, we will have
3355 // Val = getCopyFromReg(Chain_1)
3356 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3357
3358 // getCopyToReg(Chain_0) will be glued together with
3359 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3360 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3361 // Data dependency from Unit B to Unit A due to usage of Val in
3362 // getCopyToReg(Chain_1, Val)
3363 // Chain dependency from Unit A to Unit B
3364
3365 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3366 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3367 getPointerTy(MF.getDataLayout()));
3368
3369 Register RetValReg
3370 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3371 X86::RAX : X86::EAX;
3372 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3373 Flag = Chain.getValue(1);
3374
3375 // RAX/EAX now acts like a return value.
3376 RetOps.push_back(
3377 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3378
3379 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3380 // this however for preserve_most/preserve_all to minimize the number of
3381 // callee-saved registers for these CCs.
3382 if (ShouldDisableCalleeSavedRegister &&
3383 CallConv != CallingConv::PreserveAll &&
3384 CallConv != CallingConv::PreserveMost)
3385 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3386 }
3387
3388 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3389 const MCPhysReg *I =
3390 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3391 if (I) {
3392 for (; *I; ++I) {
3393 if (X86::GR64RegClass.contains(*I))
3394 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3395 else
3396 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3396)
;
3397 }
3398 }
3399
3400 RetOps[0] = Chain; // Update chain.
3401
3402 // Add the flag if we have it.
3403 if (Flag.getNode())
3404 RetOps.push_back(Flag);
3405
3406 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3407 if (CallConv == CallingConv::X86_INTR)
3408 opcode = X86ISD::IRET;
3409 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3410}
3411
3412bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3413 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3414 return false;
3415
3416 SDValue TCChain = Chain;
3417 SDNode *Copy = *N->use_begin();
3418 if (Copy->getOpcode() == ISD::CopyToReg) {
3419 // If the copy has a glue operand, we conservatively assume it isn't safe to
3420 // perform a tail call.
3421 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3422 return false;
3423 TCChain = Copy->getOperand(0);
3424 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3425 return false;
3426
3427 bool HasRet = false;
3428 for (const SDNode *U : Copy->uses()) {
3429 if (U->getOpcode() != X86ISD::RET_FLAG)
3430 return false;
3431 // If we are returning more than one value, we can definitely
3432 // not make a tail call; see PR19530.
3433 if (U->getNumOperands() > 4)
3434 return false;
3435 if (U->getNumOperands() == 4 &&
3436 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3437 return false;
3438 HasRet = true;
3439 }
3440
3441 if (!HasRet)
3442 return false;
3443
3444 Chain = TCChain;
3445 return true;
3446}
3447
3448EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3449 ISD::NodeType ExtendKind) const {
3450 MVT ReturnMVT = MVT::i32;
3451
3452 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3453 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3454 // The ABI does not require i1, i8 or i16 to be extended.
3455 //
3456 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3457 // always extending i8/i16 return values, so keep doing that for now.
3458 // (PR26665).
3459 ReturnMVT = MVT::i8;
3460 }
3461
3462 EVT MinVT = getRegisterType(Context, ReturnMVT);
3463 return VT.bitsLT(MinVT) ? MinVT : VT;
3464}
3465
3466/// Reads two 32 bit registers and creates a 64 bit mask value.
3467 /// \param VA The current 32 bit value that needs to be assigned.
3468 /// \param NextVA The next 32 bit value that needs to be assigned.
3469 /// \param Root The parent DAG node.
3470 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
3471 /// glue purposes. In case the DAG is already using a
3472 /// physical register instead of a virtual one, we should glue
3473 /// our new SDValue to the InFlag SDValue.
3474 /// \return a new SDValue of size 64 bits.
3475static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3476 SDValue &Root, SelectionDAG &DAG,
3477 const SDLoc &Dl, const X86Subtarget &Subtarget,
3478 SDValue *InFlag = nullptr) {
3479 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3480 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3481 assert(VA.getValVT() == MVT::v64i1 &&
3482 "Expecting first location of 64 bit width type");
3483 assert(NextVA.getValVT() == VA.getValVT() &&
3484 "The locations should have the same type");
3485 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3486 "The values should reside in two registers");
3487
3488 SDValue Lo, Hi;
3489 SDValue ArgValueLo, ArgValueHi;
3490
3491 MachineFunction &MF = DAG.getMachineFunction();
3492 const TargetRegisterClass *RC = &X86::GR32RegClass;
3493
3494 // Read a 32 bit value from the registers.
3495 if (nullptr == InFlag) {
3496 // When no physical register is present,
3497 // create an intermediate virtual register.
3498 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3499 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3500 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3501 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3502 } else {
3503 // When a physical register is available read the value from it and glue
3504 // the reads together.
3505 ArgValueLo =
3506 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3507 *InFlag = ArgValueLo.getValue(2);
3508 ArgValueHi =
3509 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3510 *InFlag = ArgValueHi.getValue(2);
3511 }
3512
3513 // Convert the i32 type into v32i1 type.
3514 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3515
3516 // Convert the i32 type into v32i1 type.
3517 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3518
3519 // Concatenate the two values together.
3520 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3521}
3522
3523/// The function will lower a register of various sizes (8/16/32/64)
3524 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
3525 /// \returns a DAG node containing the operand after lowering to mask type.
3526static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3527 const EVT &ValLoc, const SDLoc &Dl,
3528 SelectionDAG &DAG) {
3529 SDValue ValReturned = ValArg;
3530
3531 if (ValVT == MVT::v1i1)
3532 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3533
3534 if (ValVT == MVT::v64i1) {
3535 // In 32 bit machine, this case is handled by getv64i1Argument
3536 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3537 // On a 64 bit machine there is no need to truncate the value, only bitcast it.
3538 } else {
3539 MVT maskLen;
3540 switch (ValVT.getSimpleVT().SimpleTy) {
3541 case MVT::v8i1:
3542 maskLen = MVT::i8;
3543 break;
3544 case MVT::v16i1:
3545 maskLen = MVT::i16;
3546 break;
3547 case MVT::v32i1:
3548 maskLen = MVT::i32;
3549 break;
3550 default:
3551 llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 3551)
;
3552 }
3553
3554 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3555 }
3556 return DAG.getBitcast(ValVT, ValReturned);
3557}
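// Illustrative examples of the inverse lowering above:
//   i32 location holding v8i1  -> truncate to i8,  then bitcast to v8i1
//   i32 location holding v16i1 -> truncate to i16, then bitcast to v16i1
//   i64 location holding v64i1 -> single bitcast
//   v1i1 -> scalar_to_vector of the incoming value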
3558
3559/// Lower the result values of a call into the
3560/// appropriate copies out of appropriate physical registers.
3561///
3562SDValue X86TargetLowering::LowerCallResult(
3563 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3564 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3565 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3566 uint32_t *RegMask) const {
3567
3568 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3569 // Assign locations to each value returned by this call.
3570 SmallVector<CCValAssign, 16> RVLocs;
3571 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3572 *DAG.getContext());
3573 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3574
3575 // Copy all of the result registers out of their specified physreg.
3576 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3577 ++I, ++InsIndex) {
3578 CCValAssign &VA = RVLocs[I];
3579 EVT CopyVT = VA.getLocVT();
3580
3581 // In some calling conventions we need to remove the used registers
3582 // from the register mask.
3583 if (RegMask) {
3584 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3585 SubRegs.isValid(); ++SubRegs)
3586 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3587 }
3588
3589 // Report an error if there was an attempt to return FP values via XMM
3590 // registers.
3591 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3592 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3593 if (VA.getLocReg() == X86::XMM1)
3594 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3595 else
3596 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3597 } else if (!Subtarget.hasSSE2() &&
3598 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3599 CopyVT == MVT::f64) {
3600 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3601 if (VA.getLocReg() == X86::XMM1)
3602 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3603 else
3604 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3605 }
3606
3607 // If we prefer to use the value in xmm registers, copy it out as f80 and
3608 // use a truncate to move it from fp stack reg to xmm reg.
3609 bool RoundAfterCopy = false;
3610 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3611 isScalarFPTypeInSSEReg(VA.getValVT())) {
3612 if (!Subtarget.hasX87())
3613 report_fatal_error("X87 register return with X87 disabled");
3614 CopyVT = MVT::f80;
3615 RoundAfterCopy = (CopyVT != VA.getLocVT());
3616 }
3617
3618 SDValue Val;
3619 if (VA.needsCustom()) {
3620 assert(VA.getValVT() == MVT::v64i1 &&
3621 "Currently the only custom case is when we split v64i1 to 2 regs");
3622 Val =
3623 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3624 } else {
3625 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3626 .getValue(1);
3627 Val = Chain.getValue(0);
3628 InFlag = Chain.getValue(2);
3629 }
3630
3631 if (RoundAfterCopy)
3632 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3633 // This truncation won't change the value.
3634 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3635
3636 if (VA.isExtInLoc()) {
3637 if (VA.getValVT().isVector() &&
3638 VA.getValVT().getScalarType() == MVT::i1 &&
3639 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3640 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3641 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3642 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3643 } else
3644 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3645 }
3646
3647 if (VA.getLocInfo() == CCValAssign::BCvt)
3648 Val = DAG.getBitcast(VA.getValVT(), Val);
3649
3650 InVals.push_back(Val);
3651 }
3652
3653 return Chain;
3654}
3655
3656//===----------------------------------------------------------------------===//
3657// C & StdCall & Fast Calling Convention implementation
3658//===----------------------------------------------------------------------===//
3659// The StdCall calling convention is standard for many Windows API routines.
3660// It differs from the C calling convention only slightly: the callee cleans
3661// up the stack instead of the caller, and symbols are decorated with an
3662// argument-byte-count suffix. It doesn't support any vector arguments.
3663// For info on fast calling convention see Fast Calling Convention (tail call)
3664// implementation LowerX86_32FastCCCallTo.
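// Illustrative sketch, not part of X86ISelLowering.cpp: on 32-bit Windows a
// stdcall callee pops its own arguments and its symbol carries the argument
// byte count, e.g.
int __stdcall Add(int a, int b) { return a + b; }  // emitted as _Add@8, returns with `ret 8`
// whereas a cdecl callee uses a plain `ret` and leaves stack cleanup to the
// caller.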
3665
3666/// Determines whether Args, either a set of outgoing arguments to a call, or a
3667/// set of incoming args of a call, contains an sret pointer that the callee
3668/// pops
3669template <typename T>
3670static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3671 const X86Subtarget &Subtarget) {
3672 // Not C++20 (yet), so no concepts available.
3673 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3674 std::is_same_v<T, ISD::InputArg>,
3675 "requires ISD::OutputArg or ISD::InputArg");
3676
3677 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3678 // for most compilations.
3679 if (!Subtarget.is32Bit())
3680 return false;
3681
3682 if (Args.empty())
3683 return false;
3684
3685 // Most calls do not have an sret argument, check the arg next.
3686 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3687 if (!Flags.isSRet() || Flags.isInReg())
3688 return false;
3689
3690  // The MSVC ABI does not pop the sret.
3691 if (Subtarget.getTargetTriple().isOSMSVCRT())
3692 return false;
3693
3694 // MCUs don't pop the sret
3695 if (Subtarget.isTargetMCU())
3696 return false;
3697
3698 // Callee pops argument
3699 return true;
3700}
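// Minimal illustration of the rule above, assuming a 32-bit non-MSVC,
// non-MCU target (e.g. i386 Linux or MinGW): a struct returned in memory is
// handled through a hidden sret pointer pushed by the caller, and the callee
// pops that pointer on return (`ret 4`). That is the case hasCalleePopSRet
// reports as true.
struct Large { int v[4]; };
Large makeLarge() { return Large{{1, 2, 3, 4}}; }  // callee pops the sret slot here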
3701
3702/// Make a copy of an aggregate at address specified by "Src" to address
3703/// "Dst" with size and alignment information specified by the specific
3704/// parameter attribute. The copy will be passed as a byval function parameter.
3705static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3706 SDValue Chain, ISD::ArgFlagsTy Flags,
3707 SelectionDAG &DAG, const SDLoc &dl) {
3708 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3709
3710 return DAG.getMemcpy(
3711 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3712 /*isVolatile*/ false, /*AlwaysInline=*/true,
3713 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3714}
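// A short C++-level illustration of why this copy exists: a byval parameter
// is the callee's private copy, so writes inside the callee must never be
// visible to the caller. (Hypothetical types; large aggregates like this are
// typically lowered with the byval attribute.)
struct Payload { int data[8]; };
void consume(Payload p) { p.data[0] = 42; }  // mutates only the callee's copy
void produce() {
  Payload x{};   // x.data[0] == 0
  consume(x);    // lowered as a byval argument, copied as above
  // x.data[0] is still 0 here.
}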
3715
3716/// Return true if the calling convention is one that we can guarantee TCO for.
3717static bool canGuaranteeTCO(CallingConv::ID CC) {
3718 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3719 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3720 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3721}
3722
3723/// Return true if we might ever do TCO for calls with this calling convention.
3724static bool mayTailCallThisCC(CallingConv::ID CC) {
3725 switch (CC) {
3726 // C calling conventions:
3727 case CallingConv::C:
3728 case CallingConv::Win64:
3729 case CallingConv::X86_64_SysV:
3730 // Callee pop conventions:
3731 case CallingConv::X86_ThisCall:
3732 case CallingConv::X86_StdCall:
3733 case CallingConv::X86_VectorCall:
3734 case CallingConv::X86_FastCall:
3735 // Swift:
3736 case CallingConv::Swift:
3737 return true;
3738 default:
3739 return canGuaranteeTCO(CC);
3740 }
3741}
3742
3743/// Return true if the function is being made into a tailcall target by
3744/// changing its ABI.
3745static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3746 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3747 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3748}
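// A small sanity sketch of how the two predicates above compose; the values
// follow directly from the definitions and are not taken from any test.
[[maybe_unused]] static void checkGuaranteeTCOExamples() {
  assert(shouldGuaranteeTCO(CallingConv::Fast, /*GuaranteedTailCallOpt=*/true));   // opt + fastcc
  assert(shouldGuaranteeTCO(CallingConv::Tail, /*GuaranteedTailCallOpt=*/false));  // tailcc always
  assert(!shouldGuaranteeTCO(CallingConv::C,   /*GuaranteedTailCallOpt=*/true));   // C is never guaranteed
}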
3749
3750bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3751 if (!CI->isTailCall())
3752 return false;
3753
3754 CallingConv::ID CalleeCC = CI->getCallingConv();
3755 if (!mayTailCallThisCC(CalleeCC))
3756 return false;
3757
3758 return true;
3759}
3760
3761SDValue
3762X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3763 const SmallVectorImpl<ISD::InputArg> &Ins,
3764 const SDLoc &dl, SelectionDAG &DAG,
3765 const CCValAssign &VA,
3766 MachineFrameInfo &MFI, unsigned i) const {
3767 // Create the nodes corresponding to a load from this parameter slot.
3768 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3769 bool AlwaysUseMutable = shouldGuaranteeTCO(
3770 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3771 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3772 EVT ValVT;
3773 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3774
3775  // If the value is passed by pointer, we have the address passed instead of
3776  // the value itself. No need to extend if the mask value and the location
3777  // share the same absolute size.
3778 bool ExtendedInMem =
3779 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3780 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3781
3782 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3783 ValVT = VA.getLocVT();
3784 else
3785 ValVT = VA.getValVT();
3786
3787  // FIXME: For now, all byval parameter objects are marked mutable. This can
3788  // be changed with more analysis.
3789  // In case of tail call optimization, mark all arguments mutable, since they
3790  // could be overwritten by the lowering of arguments in case of a tail call.
3791 if (Flags.isByVal()) {
3792 unsigned Bytes = Flags.getByValSize();
3793 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3794
3795 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3796 // can be improved with deeper analysis.
3797 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3798 /*isAliased=*/true);
3799 return DAG.getFrameIndex(FI, PtrVT);
3800 }
3801
3802 EVT ArgVT = Ins[i].ArgVT;
3803
3804 // If this is a vector that has been split into multiple parts, and the
3805  // scalar size of the parts doesn't match the vector element size, then we can't
3806 // elide the copy. The parts will have padding between them instead of being
3807 // packed like a vector.
3808 bool ScalarizedAndExtendedVector =
3809 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3810 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3811
3812 // This is an argument in memory. We might be able to perform copy elision.
3813 // If the argument is passed directly in memory without any extension, then we
3814 // can perform copy elision. Large vector types, for example, may be passed
3815 // indirectly by pointer.
3816 if (Flags.isCopyElisionCandidate() &&
3817 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3818 !ScalarizedAndExtendedVector) {
3819 SDValue PartAddr;
3820 if (Ins[i].PartOffset == 0) {
3821 // If this is a one-part value or the first part of a multi-part value,
3822 // create a stack object for the entire argument value type and return a
3823 // load from our portion of it. This assumes that if the first part of an
3824 // argument is in memory, the rest will also be in memory.
3825 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3826 /*IsImmutable=*/false);
3827 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3828 return DAG.getLoad(
3829 ValVT, dl, Chain, PartAddr,
3830 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3831 } else {
3832 // This is not the first piece of an argument in memory. See if there is
3833 // already a fixed stack object including this offset. If so, assume it
3834 // was created by the PartOffset == 0 branch above and create a load from
3835 // the appropriate offset into it.
3836 int64_t PartBegin = VA.getLocMemOffset();
3837 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3838 int FI = MFI.getObjectIndexBegin();
3839 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3840 int64_t ObjBegin = MFI.getObjectOffset(FI);
3841 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3842 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3843 break;
3844 }
3845 if (MFI.isFixedObjectIndex(FI)) {
3846 SDValue Addr =
3847 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3848 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3849 return DAG.getLoad(
3850 ValVT, dl, Chain, Addr,
3851 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3852 Ins[i].PartOffset));
3853 }
3854 }
3855 }
3856
3857 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3858 VA.getLocMemOffset(), isImmutable);
3859
3860 // Set SExt or ZExt flag.
3861 if (VA.getLocInfo() == CCValAssign::ZExt) {
3862 MFI.setObjectZExt(FI, true);
3863 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3864 MFI.setObjectSExt(FI, true);
3865 }
3866
3867 MaybeAlign Alignment;
3868 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3869 ValVT != MVT::f80)
3870 Alignment = MaybeAlign(4);
3871 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3872 SDValue Val = DAG.getLoad(
3873 ValVT, dl, Chain, FIN,
3874 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3875 Alignment);
3876 return ExtendedInMem
3877 ? (VA.getValVT().isVector()
3878 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3879 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3880 : Val;
3881}
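// Worked example of the containment test used by the copy-elision search
// above (hypothetical numbers): the second i64 piece of a 16-byte argument
// whose first piece created a fixed object at stack offset 16.
//   PartBegin = 24, PartEnd = 24 + 64 / 8 = 32
//   ObjBegin  = 16, ObjEnd  = 16 + 16     = 32
// ObjBegin <= PartBegin && PartEnd <= ObjEnd holds, so the existing object is
// reused and the piece is loaded at PartOffset bytes from its base.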
3882
3883// FIXME: Get this from tablegen.
3884static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3885 const X86Subtarget &Subtarget) {
3886  assert(Subtarget.is64Bit());
3887
3888 if (Subtarget.isCallingConvWin64(CallConv)) {
3889 static const MCPhysReg GPR64ArgRegsWin64[] = {
3890 X86::RCX, X86::RDX, X86::R8, X86::R9
3891 };
3892 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3893 }
3894
3895 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3896 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3897 };
3898 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3899}
3900
3901// FIXME: Get this from tablegen.
3902static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3903 CallingConv::ID CallConv,
3904 const X86Subtarget &Subtarget) {
3905  assert(Subtarget.is64Bit());
3906 if (Subtarget.isCallingConvWin64(CallConv)) {
3907 // The XMM registers which might contain var arg parameters are shadowed
3908 // in their paired GPR. So we only need to save the GPR to their home
3909 // slots.
3910 // TODO: __vectorcall will change this.
3911 return std::nullopt;
3912 }
3913
3914 bool isSoftFloat = Subtarget.useSoftFloat();
3915 if (isSoftFloat || !Subtarget.hasSSE1())
3916 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3917 // registers.
3918 return std::nullopt;
3919
3920 static const MCPhysReg XMMArgRegs64Bit[] = {
3921 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3922 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3923 };
3924 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3925}
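// Quick reference for the two helpers above (summary only, not generated
// output):
//   SysV x86-64 varargs GPRs : RDI, RSI, RDX, RCX, R8, R9         (6 regs)
//   Win64 varargs GPRs       : RCX, RDX, R8, R9                   (4 regs)
//   SysV XMM save list       : XMM0..XMM7 when SSE1 is available  (8 regs)
//   Win64 XMM save list      : empty, since the XMM args are shadowed by
//                              their paired GPRs and saved via the home slots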
3926
3927#ifndef NDEBUG
3928static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3929 return llvm::is_sorted(
3930 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3931 return A.getValNo() < B.getValNo();
3932 });
3933}
3934#endif
3935
3936namespace {
3937/// This is a helper class for lowering variable arguments parameters.
3938class VarArgsLoweringHelper {
3939public:
3940 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3941 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3942 CallingConv::ID CallConv, CCState &CCInfo)
3943 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3944 TheMachineFunction(DAG.getMachineFunction()),
3945 TheFunction(TheMachineFunction.getFunction()),
3946 FrameInfo(TheMachineFunction.getFrameInfo()),
3947 FrameLowering(*Subtarget.getFrameLowering()),
3948 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3949 CCInfo(CCInfo) {}
3950
3951 // Lower variable arguments parameters.
3952 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3953
3954private:
3955 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3956
3957 void forwardMustTailParameters(SDValue &Chain);
3958
3959 bool is64Bit() const { return Subtarget.is64Bit(); }
3960 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3961
3962 X86MachineFunctionInfo *FuncInfo;
3963 const SDLoc &DL;
3964 SelectionDAG &DAG;
3965 const X86Subtarget &Subtarget;
3966 MachineFunction &TheMachineFunction;
3967 const Function &TheFunction;
3968 MachineFrameInfo &FrameInfo;
3969 const TargetFrameLowering &FrameLowering;
3970 const TargetLowering &TargLowering;
3971 CallingConv::ID CallConv;
3972 CCState &CCInfo;
3973};
3974} // namespace
3975
3976void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3977 SDValue &Chain, unsigned StackSize) {
3978  // If the function takes a variable number of arguments, make a frame index for
3979 // the start of the first vararg value... for expansion of llvm.va_start. We
3980 // can skip this if there are no va_start calls.
3981 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3982 CallConv != CallingConv::X86_ThisCall)) {
3983 FuncInfo->setVarArgsFrameIndex(
3984 FrameInfo.CreateFixedObject(1, StackSize, true));
3985 }
3986
3987 // 64-bit calling conventions support varargs and register parameters, so we
3988 // have to do extra work to spill them in the prologue.
3989 if (is64Bit()) {
3990 // Find the first unallocated argument registers.
3991 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3992 ArrayRef<MCPhysReg> ArgXMMs =
3993 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3994 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3995 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3996
3997    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3998           "SSE register cannot be used when SSE is disabled!");
3999
4000 if (isWin64()) {
4001 // Get to the caller-allocated home save location. Add 8 to account
4002 // for the return address.
4003 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4004 FuncInfo->setRegSaveFrameIndex(
4005 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4006 // Fixup to set vararg frame on shadow area (4 x i64).
4007 if (NumIntRegs < 4)
4008 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4009 } else {
4010 // For X86-64, if there are vararg parameters that are passed via
4011 // registers, then we must store them to their spots on the stack so
4012 // they may be loaded by dereferencing the result of va_next.
4013 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4014 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4015 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4016 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4017 }
4018
4019 SmallVector<SDValue, 6>
4020 LiveGPRs; // list of SDValue for GPR registers keeping live input value
4021 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
4022 // keeping live input value
4023 SDValue ALVal; // if applicable keeps SDValue for %al register
4024
4025 // Gather all the live in physical registers.
4026 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4027 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4028 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4029 }
4030 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4031 if (!AvailableXmms.empty()) {
4032 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4033 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4034 for (MCPhysReg Reg : AvailableXmms) {
4035        // The fast register allocator spills virtual registers at basic
4036        // block boundaries. That leads to uses of XMM registers outside
4037        // of the check for %al. Pass physical registers to
4038        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4039 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4040 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4041 }
4042 }
4043
4044 // Store the integer parameter registers.
4045 SmallVector<SDValue, 8> MemOps;
4046 SDValue RSFIN =
4047 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4048 TargLowering.getPointerTy(DAG.getDataLayout()));
4049 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4050 for (SDValue Val : LiveGPRs) {
4051 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4052 TargLowering.getPointerTy(DAG.getDataLayout()),
4053 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4054 SDValue Store =
4055 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4056 MachinePointerInfo::getFixedStack(
4057 DAG.getMachineFunction(),
4058 FuncInfo->getRegSaveFrameIndex(), Offset));
4059 MemOps.push_back(Store);
4060 Offset += 8;
4061 }
4062
4063 // Now store the XMM (fp + vector) parameter registers.
4064 if (!LiveXMMRegs.empty()) {
4065 SmallVector<SDValue, 12> SaveXMMOps;
4066 SaveXMMOps.push_back(Chain);
4067 SaveXMMOps.push_back(ALVal);
4068 SaveXMMOps.push_back(RSFIN);
4069 SaveXMMOps.push_back(
4070 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4071 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4072 MachineMemOperand *StoreMMO =
4073 DAG.getMachineFunction().getMachineMemOperand(
4074 MachinePointerInfo::getFixedStack(
4075 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4076 Offset),
4077 MachineMemOperand::MOStore, 128, Align(16));
4078 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4079 DL, DAG.getVTList(MVT::Other),
4080 SaveXMMOps, MVT::i8, StoreMMO));
4081 }
4082
4083 if (!MemOps.empty())
4084 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4085 }
4086}
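// Worked example of the SysV register save area set up above, with
// hypothetical counts (two named integer args, one named FP arg, SSE on):
unsigned ExampleNumIntRegs = 2, ExampleNumXMMRegs = 1;
unsigned ExampleGPOffset = ExampleNumIntRegs * 8;            // 16: next free GPR slot
unsigned ExampleFPOffset = 6 * 8 + ExampleNumXMMRegs * 16;   // 64: next free XMM slot
unsigned ExampleAreaSize = 6 * 8 + 8 * 16;                   // 176 bytes, Align(16)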
4087
4088void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4089 // Find the largest legal vector type.
4090 MVT VecVT = MVT::Other;
4091 // FIXME: Only some x86_32 calling conventions support AVX512.
4092 if (Subtarget.useAVX512Regs() &&
4093 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4094 CallConv == CallingConv::Intel_OCL_BI)))
4095 VecVT = MVT::v16f32;
4096 else if (Subtarget.hasAVX())
4097 VecVT = MVT::v8f32;
4098 else if (Subtarget.hasSSE2())
4099 VecVT = MVT::v4f32;
4100
4101 // We forward some GPRs and some vector types.
4102 SmallVector<MVT, 2> RegParmTypes;
4103 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4104 RegParmTypes.push_back(IntVT);
4105 if (VecVT != MVT::Other)
4106 RegParmTypes.push_back(VecVT);
4107
4108 // Compute the set of forwarded registers. The rest are scratch.
4109 SmallVectorImpl<ForwardedRegister> &Forwards =
4110 FuncInfo->getForwardedMustTailRegParms();
4111 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4112
4113 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4114 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4115 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4116 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4117 }
4118
4119 // Copy all forwards from physical to virtual registers.
4120 for (ForwardedRegister &FR : Forwards) {
4121 // FIXME: Can we use a less constrained schedule?
4122 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4123 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4124 TargLowering.getRegClassFor(FR.VT));
4125 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4126 }
4127}
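// A hedged C-level sketch of the situation this handles: forwarding a
// variadic call with clang's musttail, where the incoming register arguments
// (including %al and the XMM argument registers) must reach the callee
// unchanged. Function names are hypothetical.
int target(const char *fmt, ...);
int forward(const char *fmt, ...) {
  __attribute__((musttail)) return target(fmt);  // unnamed varargs are forwarded in registers
}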
4128
4129void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4130 unsigned StackSize) {
4131  // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4132  // If necessary, it will be set to the correct value later.
4133 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4134 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4135
4136 if (FrameInfo.hasVAStart())
4137 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4138
4139 if (FrameInfo.hasMustTailInVarArgFunc())
4140 forwardMustTailParameters(Chain);
4141}
4142
4143SDValue X86TargetLowering::LowerFormalArguments(
4144 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4145 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4146 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4147 MachineFunction &MF = DAG.getMachineFunction();
4148 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4149
4150 const Function &F = MF.getFunction();
4151 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4152 F.getName() == "main")
4153 FuncInfo->setForceFramePointer(true);
4154
4155 MachineFrameInfo &MFI = MF.getFrameInfo();
4156 bool Is64Bit = Subtarget.is64Bit();
4157 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4158
4159  assert(
4160      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4161      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4162
4163 // Assign locations to all of the incoming arguments.
4164 SmallVector<CCValAssign, 16> ArgLocs;
4165 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4166
4167 // Allocate shadow area for Win64.
4168 if (IsWin64)
4169 CCInfo.AllocateStack(32, Align(8));
4170
4171 CCInfo.AnalyzeArguments(Ins, CC_X86);
4172
4173 // In vectorcall calling convention a second pass is required for the HVA
4174 // types.
4175 if (CallingConv::X86_VectorCall == CallConv) {
4176 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4177 }
4178
4179  // The next loop assumes that the locations are in the same order as the
4180 // input arguments.
4181  assert(isSortedByValueNo(ArgLocs) &&
4182         "Argument Location list must be sorted before lowering");
4183
4184 SDValue ArgValue;
4185 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4186 ++I, ++InsIndex) {
4187    assert(InsIndex < Ins.size() && "Invalid Ins index");
4188 CCValAssign &VA = ArgLocs[I];
4189
4190 if (VA.isRegLoc()) {
4191 EVT RegVT = VA.getLocVT();
4192 if (VA.needsCustom()) {
4193        assert(
4194            VA.getValVT() == MVT::v64i1 &&
4195            "Currently the only custom case is when we split v64i1 to 2 regs");
4196
4197 // v64i1 values, in regcall calling convention, that are
4198 // compiled to 32 bit arch, are split up into two registers.
4199 ArgValue =
4200 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4201 } else {
4202 const TargetRegisterClass *RC;
4203 if (RegVT == MVT::i8)
4204 RC = &X86::GR8RegClass;
4205 else if (RegVT == MVT::i16)
4206 RC = &X86::GR16RegClass;
4207 else if (RegVT == MVT::i32)
4208 RC = &X86::GR32RegClass;
4209 else if (Is64Bit && RegVT == MVT::i64)
4210 RC = &X86::GR64RegClass;
4211 else if (RegVT == MVT::f16)
4212 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4213 else if (RegVT == MVT::f32)
4214 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4215 else if (RegVT == MVT::f64)
4216 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4217 else if (RegVT == MVT::f80)
4218 RC = &X86::RFP80RegClass;
4219 else if (RegVT == MVT::f128)
4220 RC = &X86::VR128RegClass;
4221 else if (RegVT.is512BitVector())
4222 RC = &X86::VR512RegClass;
4223 else if (RegVT.is256BitVector())
4224 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4225 else if (RegVT.is128BitVector())
4226 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4227 else if (RegVT == MVT::x86mmx)
4228 RC = &X86::VR64RegClass;
4229 else if (RegVT == MVT::v1i1)
4230 RC = &X86::VK1RegClass;
4231 else if (RegVT == MVT::v8i1)
4232 RC = &X86::VK8RegClass;
4233 else if (RegVT == MVT::v16i1)
4234 RC = &X86::VK16RegClass;
4235 else if (RegVT == MVT::v32i1)
4236 RC = &X86::VK32RegClass;
4237 else if (RegVT == MVT::v64i1)
4238 RC = &X86::VK64RegClass;
4239 else
4240          llvm_unreachable("Unknown argument type!");
4241
4242 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4243 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4244 }
4245
4246 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4247 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4248 // right size.
4249 if (VA.getLocInfo() == CCValAssign::SExt)
4250 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4251 DAG.getValueType(VA.getValVT()));
4252 else if (VA.getLocInfo() == CCValAssign::ZExt)
4253 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4254 DAG.getValueType(VA.getValVT()));
4255 else if (VA.getLocInfo() == CCValAssign::BCvt)
4256 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4257
4258 if (VA.isExtInLoc()) {
4259 // Handle MMX values passed in XMM regs.
4260 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4261 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4262 else if (VA.getValVT().isVector() &&
4263 VA.getValVT().getScalarType() == MVT::i1 &&
4264 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4265 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4266 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4267 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4268 } else
4269 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4270 }
4271 } else {
4272      assert(VA.isMemLoc());
4273 ArgValue =
4274 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4275 }
4276
4277 // If value is passed via pointer - do a load.
4278 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4279 ArgValue =
4280 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4281
4282 InVals.push_back(ArgValue);
4283 }
4284
4285 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4286 if (Ins[I].Flags.isSwiftAsync()) {
4287 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4288 if (Subtarget.is64Bit())
4289 X86FI->setHasSwiftAsyncContext(true);
4290 else {
4291 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4292 X86FI->setSwiftAsyncContextFrameIdx(FI);
4293 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4294 DAG.getFrameIndex(FI, MVT::i32),
4295 MachinePointerInfo::getFixedStack(MF, FI));
4296 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4297 }
4298 }
4299
4300 // Swift calling convention does not require we copy the sret argument
4301 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4302 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4303 continue;
4304
4305 // All x86 ABIs require that for returning structs by value we copy the
4306 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4307 // the argument into a virtual register so that we can access it from the
4308 // return points.
4309 if (Ins[I].Flags.isSRet()) {
4310      assert(!FuncInfo->getSRetReturnReg() &&
4311             "SRet return has already been set");
4312 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4313 Register Reg =
4314 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4315 FuncInfo->setSRetReturnReg(Reg);
4316 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4317 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4318 break;
4319 }
4320 }
4321
4322 unsigned StackSize = CCInfo.getNextStackOffset();
4323 // Align stack specially for tail calls.
4324 if (shouldGuaranteeTCO(CallConv,
4325 MF.getTarget().Options.GuaranteedTailCallOpt))
4326 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4327
4328 if (IsVarArg)
4329 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4330 .lowerVarArgsParameters(Chain, StackSize);
4331
4332 // Some CCs need callee pop.
4333 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4334 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4335 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4336 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4337 // X86 interrupts must pop the error code (and the alignment padding) if
4338 // present.
4339 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4340 } else {
4341 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4342 // If this is an sret function, the return should pop the hidden pointer.
4343 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4344 FuncInfo->setBytesToPopOnReturn(4);
4345 }
4346
4347 if (!Is64Bit) {
4348 // RegSaveFrameIndex is X86-64 only.
4349 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4350 }
4351
4352 FuncInfo->setArgumentStackSize(StackSize);
4353
4354 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4355 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4356 if (Personality == EHPersonality::CoreCLR) {
4357      assert(Is64Bit);
4358 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4359 // that we'd prefer this slot be allocated towards the bottom of the frame
4360 // (i.e. near the stack pointer after allocating the frame). Every
4361 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4362 // offset from the bottom of this and each funclet's frame must be the
4363 // same, so the size of funclets' (mostly empty) frames is dictated by
4364 // how far this slot is from the bottom (since they allocate just enough
4365 // space to accommodate holding this slot at the correct offset).
4366 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4367 EHInfo->PSPSymFrameIdx = PSPSymFI;
4368 }
4369 }
4370
4371 if (shouldDisableArgRegFromCSR(CallConv) ||
4372 F.hasFnAttribute("no_caller_saved_registers")) {
4373 MachineRegisterInfo &MRI = MF.getRegInfo();
4374 for (std::pair<Register, Register> Pair : MRI.liveins())
4375 MRI.disableCalleeSavedRegister(Pair.first);
4376 }
4377
4378 return Chain;
4379}
4380
4381SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4382 SDValue Arg, const SDLoc &dl,
4383 SelectionDAG &DAG,
4384 const CCValAssign &VA,
4385 ISD::ArgFlagsTy Flags,
4386 bool isByVal) const {
4387 unsigned LocMemOffset = VA.getLocMemOffset();
4388 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4389 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4390 StackPtr, PtrOff);
4391 if (isByVal)
4392 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4393
4394 MaybeAlign Alignment;
4395 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4396 Arg.getSimpleValueType() != MVT::f80)
4397 Alignment = MaybeAlign(4);
4398 return DAG.getStore(
4399 Chain, dl, Arg, PtrOff,
4400 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4401 Alignment);
4402}
4403
4404/// Emit a load of return address if tail call
4405/// optimization is performed and it is required.
4406SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4407 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4408 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4409 // Adjust the Return address stack slot.
4410 EVT VT = getPointerTy(DAG.getDataLayout());
4411 OutRetAddr = getReturnAddressFrameIndex(DAG);
4412
4413 // Load the "old" Return address.
4414 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4415 return SDValue(OutRetAddr.getNode(), 1);
4416}
4417
4418/// Emit a store of the return address if tail call
4419/// optimization is performed and it is required (FPDiff!=0).
4420static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4421 SDValue Chain, SDValue RetAddrFrIdx,
4422 EVT PtrVT, unsigned SlotSize,
4423 int FPDiff, const SDLoc &dl) {
4424 // Store the return address to the appropriate stack slot.
4425 if (!FPDiff) return Chain;
4426 // Calculate the new stack slot for the return address.
4427 int NewReturnAddrFI =
4428 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4429 false);
4430 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4431 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4432 MachinePointerInfo::getFixedStack(
4433 DAG.getMachineFunction(), NewReturnAddrFI));
4434 return Chain;
4435}
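// Worked example of the FPDiff bookkeeping (hypothetical sizes): if the
// caller was entered with 8 bytes of stack arguments and the guaranteed tail
// call needs 24, the return address must be re-stored lower on the stack:
//   FPDiff                 = 8 - 24            = -16
//   NewReturnAddrFI offset = FPDiff - SlotSize = -16 - 8 = -24
// matching the CreateFixedObject call above.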
4436
4437/// Returns a vector_shuffle mask for an movs{s|d}, movd
4438/// operation of specified width.
4439static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4440 SDValue V2) {
4441 unsigned NumElems = VT.getVectorNumElements();
4442 SmallVector<int, 8> Mask;
4443 Mask.push_back(NumElems);
4444 for (unsigned i = 1; i != NumElems; ++i)
4445 Mask.push_back(i);
4446 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4447}
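// Example of the mask built above (follows directly from the loop): for
// MVT::v4f32 the mask is <4, 1, 2, 3>, i.e. lane 0 is taken from V2 and
// lanes 1-3 from V1, the MOVSS/MOVSD "insert low element" pattern:
//   getMOVL(DAG, dl, MVT::v4f32, V1, V2)
//     == DAG.getVectorShuffle(MVT::v4f32, dl, V1, V2, {4, 1, 2, 3})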
4448
4449SDValue
4450X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4451 SmallVectorImpl<SDValue> &InVals) const {
4452 SelectionDAG &DAG = CLI.DAG;
4453 SDLoc &dl = CLI.DL;
4454 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4455 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4456 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4457 SDValue Chain = CLI.Chain;
4458 SDValue Callee = CLI.Callee;
4459 CallingConv::ID CallConv = CLI.CallConv;
4460 bool &isTailCall = CLI.IsTailCall;
4461 bool isVarArg = CLI.IsVarArg;
4462 const auto *CB = CLI.CB;
4463
4464 MachineFunction &MF = DAG.getMachineFunction();
4465 bool Is64Bit = Subtarget.is64Bit();
4466 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4467 bool IsSibcall = false;
4468 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4469 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4470 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4471 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4472 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4473 CB->hasFnAttr("no_caller_saved_registers"));
4474 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4475 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4476 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4477 const Module *M = MF.getMMI().getModule();
4478 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4479
4480 MachineFunction::CallSiteInfo CSInfo;
4481 if (CallConv == CallingConv::X86_INTR)
4482 report_fatal_error("X86 interrupts may not be called directly");
4483
4484 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4485 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4486 // If we are using a GOT, disable tail calls to external symbols with
4487 // default visibility. Tail calling such a symbol requires using a GOT
4488 // relocation, which forces early binding of the symbol. This breaks code
4489    // that requires lazy function symbol resolution. Using musttail or
4490 // GuaranteedTailCallOpt will override this.
4491 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4492 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4493 G->getGlobal()->hasDefaultVisibility()))
4494 isTailCall = false;
4495 }
4496
4497 if (isTailCall && !IsMustTail) {
4498 // Check if it's really possible to do a tail call.
4499 isTailCall = IsEligibleForTailCallOptimization(
4500 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4501 Ins, DAG);
4502
4503 // Sibcalls are automatically detected tailcalls which do not require
4504 // ABI changes.
4505 if (!IsGuaranteeTCO && isTailCall)
4506 IsSibcall = true;
4507
4508 if (isTailCall)
4509 ++NumTailCalls;
4510 }
4511
4512 if (IsMustTail && !isTailCall)
4513 report_fatal_error("failed to perform tail call elimination on a call "
4514 "site marked musttail");
4515
4516  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4517         "Var args not supported with calling convention fastcc, ghc or hipe");
4518
4519 // Analyze operands of the call, assigning locations to each operand.
4520 SmallVector<CCValAssign, 16> ArgLocs;
4521 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4522
4523 // Allocate shadow area for Win64.
4524 if (IsWin64)
4525 CCInfo.AllocateStack(32, Align(8));
4526
4527 CCInfo.AnalyzeArguments(Outs, CC_X86);
4528
4529 // In vectorcall calling convention a second pass is required for the HVA
4530 // types.
4531 if (CallingConv::X86_VectorCall == CallConv) {
4532 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4533 }
4534
4535 // Get a count of how many bytes are to be pushed on the stack.
4536 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4537 if (IsSibcall)
4538 // This is a sibcall. The memory operands are available in caller's
4539 // own caller's stack.
4540 NumBytes = 0;
4541 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4542 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4543
4544 int FPDiff = 0;
4545 if (isTailCall &&
4546 shouldGuaranteeTCO(CallConv,
4547 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4548 // Lower arguments at fp - stackoffset + fpdiff.
4549 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4550
4551 FPDiff = NumBytesCallerPushed - NumBytes;
4552
4553 // Set the delta of movement of the returnaddr stackslot.
4554 // But only set if delta is greater than previous delta.
4555 if (FPDiff < X86Info->getTCReturnAddrDelta())
4556 X86Info->setTCReturnAddrDelta(FPDiff);
4557 }
4558
4559 unsigned NumBytesToPush = NumBytes;
4560 unsigned NumBytesToPop = NumBytes;
4561
4562 // If we have an inalloca argument, all stack space has already been allocated
4563  // for us and will be right at the top of the stack. We don't support multiple
4564 // arguments passed in memory when using inalloca.
4565 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4566 NumBytesToPush = 0;
4567 if (!ArgLocs.back().isMemLoc())
4568 report_fatal_error("cannot use inalloca attribute on a register "
4569 "parameter");
4570 if (ArgLocs.back().getLocMemOffset() != 0)
4571 report_fatal_error("any parameter with the inalloca attribute must be "
4572 "the only memory argument");
4573 } else if (CLI.IsPreallocated) {
4574    assert(ArgLocs.back().isMemLoc() &&
4575           "cannot use preallocated attribute on a register "
4576           "parameter");
4577 SmallVector<size_t, 4> PreallocatedOffsets;
4578 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4579 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4580 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4581 }
4582 }
4583 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4584 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4585 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4586 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4587 NumBytesToPush = 0;
4588 }
4589
4590 if (!IsSibcall && !IsMustTail)
4591 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4592 NumBytes - NumBytesToPush, dl);
4593
4594 SDValue RetAddrFrIdx;
4595 // Load return address for tail calls.
4596 if (isTailCall && FPDiff)
4597 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4598 Is64Bit, FPDiff, dl);
4599
4600 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4601 SmallVector<SDValue, 8> MemOpChains;
4602 SDValue StackPtr;
4603
4604  // The next loop assumes that the locations are in the same order as the
4605 // input arguments.
4606  assert(isSortedByValueNo(ArgLocs) &&
4607         "Argument Location list must be sorted before lowering");
4608
4609 // Walk the register/memloc assignments, inserting copies/loads. In the case
4610  // of tail call optimization, arguments are handled later.
4611 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4612 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4613 ++I, ++OutIndex) {
4614    assert(OutIndex < Outs.size() && "Invalid Out index");
4615 // Skip inalloca/preallocated arguments, they have already been written.
4616 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4617 if (Flags.isInAlloca() || Flags.isPreallocated())
4618 continue;
4619
4620 CCValAssign &VA = ArgLocs[I];
4621 EVT RegVT = VA.getLocVT();
4622 SDValue Arg = OutVals[OutIndex];
4623 bool isByVal = Flags.isByVal();
4624
4625 // Promote the value if needed.
4626 switch (VA.getLocInfo()) {
4627    default: llvm_unreachable("Unknown loc info!");
4628 case CCValAssign::Full: break;
4629 case CCValAssign::SExt:
4630 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4631 break;
4632 case CCValAssign::ZExt:
4633 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4634 break;
4635 case CCValAssign::AExt:
4636 if (Arg.getValueType().isVector() &&
4637 Arg.getValueType().getVectorElementType() == MVT::i1)
4638 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4639 else if (RegVT.is128BitVector()) {
4640 // Special case: passing MMX values in XMM registers.
4641 Arg = DAG.getBitcast(MVT::i64, Arg);
4642 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4643 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4644 } else
4645 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4646 break;
4647 case CCValAssign::BCvt:
4648 Arg = DAG.getBitcast(RegVT, Arg);
4649 break;
4650 case CCValAssign::Indirect: {
4651 if (isByVal) {
4652 // Memcpy the argument to a temporary stack slot to prevent
4653 // the caller from seeing any modifications the callee may make
4654 // as guaranteed by the `byval` attribute.
4655 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4656 Flags.getByValSize(),
4657 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4658 SDValue StackSlot =
4659 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4660 Chain =
4661 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4662 // From now on treat this as a regular pointer
4663 Arg = StackSlot;
4664 isByVal = false;
4665 } else {
4666 // Store the argument.
4667 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4668 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4669 Chain = DAG.getStore(
4670 Chain, dl, Arg, SpillSlot,
4671 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4672 Arg = SpillSlot;
4673 }
4674 break;
4675 }
4676 }
4677
4678 if (VA.needsCustom()) {
4679      assert(VA.getValVT() == MVT::v64i1 &&
4680             "Currently the only custom case is when we split v64i1 to 2 regs");
4681 // Split v64i1 value into two registers
4682 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4683 } else if (VA.isRegLoc()) {
4684 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4685 const TargetOptions &Options = DAG.getTarget().Options;
4686 if (Options.EmitCallSiteInfo)
4687 CSInfo.emplace_back(VA.getLocReg(), I);
4688 if (isVarArg && IsWin64) {
4689        // The Win64 ABI requires an argument XMM register to be copied to the
4690        // corresponding shadow register if the callee is a varargs function.
4691 Register ShadowReg;
4692 switch (VA.getLocReg()) {
4693 case X86::XMM0: ShadowReg = X86::RCX; break;
4694 case X86::XMM1: ShadowReg = X86::RDX; break;
4695 case X86::XMM2: ShadowReg = X86::R8; break;
4696 case X86::XMM3: ShadowReg = X86::R9; break;
4697 }
4698 if (ShadowReg)
4699 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4700 }
4701 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4702      assert(VA.isMemLoc());
4703 if (!StackPtr.getNode())
4704 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4705 getPointerTy(DAG.getDataLayout()));
4706 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4707 dl, DAG, VA, Flags, isByVal));
4708 }
4709 }
4710
4711 if (!MemOpChains.empty())
4712 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4713
4714 if (Subtarget.isPICStyleGOT()) {
4715    // ELF / PIC requires the GOT address to be in the EBX register before
4716    // function calls made via the PLT (except for regcall).
4717 if (!isTailCall) {
4718      // An indirect call with the RegCall calling convention may use up all the
4719      // general registers, so it is not suitable to bind the EBX register for the
4720      // GOT address; just let the register allocator handle it.
4721 if (CallConv != CallingConv::X86_RegCall)
4722 RegsToPass.push_back(std::make_pair(
4723 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4724 getPointerTy(DAG.getDataLayout()))));
4725 } else {
4726 // If we are tail calling and generating PIC/GOT style code load the
4727 // address of the callee into ECX. The value in ecx is used as target of
4728 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4729 // for tail calls on PIC/GOT architectures. Normally we would just put the
4730 // address of GOT into ebx and then call target@PLT. But for tail calls
4731 // ebx would be restored (since ebx is callee saved) before jumping to the
4732 // target@PLT.
4733
4734 // Note: The actual moving to ECX is done further down.
4735 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4736 if (G && !G->getGlobal()->hasLocalLinkage() &&
4737 G->getGlobal()->hasDefaultVisibility())
4738 Callee = LowerGlobalAddress(Callee, DAG);
4739 else if (isa<ExternalSymbolSDNode>(Callee))
4740 Callee = LowerExternalSymbol(Callee, DAG);
4741 }
4742 }
4743
4744 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4745 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4746 // From AMD64 ABI document:
4747 // For calls that may call functions that use varargs or stdargs
4748 // (prototype-less calls or calls to functions containing ellipsis (...) in
4749 // the declaration) %al is used as hidden argument to specify the number
4750 // of SSE registers used. The contents of %al do not need to match exactly
4751    // the number of registers, but must be an upper bound on the number of SSE
4752 // registers used and is in the range 0 - 8 inclusive.
4753
4754 // Count the number of XMM registers allocated.
4755 static const MCPhysReg XMMArgRegs[] = {
4756 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4757 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4758 };
4759 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4760    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4761           && "SSE registers cannot be used when SSE is disabled");
4762 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4763 DAG.getConstant(NumXMMRegs, dl,
4764 MVT::i8)));
4765 }
4766
4767 if (isVarArg && IsMustTail) {
4768 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4769 for (const auto &F : Forwards) {
4770 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4771 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4772 }
4773 }
4774
4775 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4776 // don't need this because the eligibility check rejects calls that require
4777 // shuffling arguments passed in memory.
4778 if (!IsSibcall && isTailCall) {
4779 // Force all the incoming stack arguments to be loaded from the stack
4780 // before any new outgoing arguments are stored to the stack, because the
4781 // outgoing stack slots may alias the incoming argument stack slots, and
4782 // the alias isn't otherwise explicit. This is slightly more conservative
4783 // than necessary, because it means that each store effectively depends
4784 // on every argument instead of just those arguments it would clobber.
4785 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4786
4787 SmallVector<SDValue, 8> MemOpChains2;
4788 SDValue FIN;
4789 int FI = 0;
4790 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4791 ++I, ++OutsIndex) {
4792 CCValAssign &VA = ArgLocs[I];
4793
4794 if (VA.isRegLoc()) {
4795 if (VA.needsCustom()) {
4796          assert((CallConv == CallingConv::X86_RegCall) &&
4797                 "Expecting custom case only in regcall calling convention");
4798 // This means that we are in the special case where one argument was
4799 // passed through two register locations - skip the next location.
4800 ++I;
4801 }
4802
4803 continue;
4804 }
4805
4806 assert(VA.isMemLoc());
4807 SDValue Arg = OutVals[OutsIndex];
4808 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4809 // Skip inalloca/preallocated arguments. They don't require any work.
4810 if (Flags.isInAlloca() || Flags.isPreallocated())
4811 continue;
4812 // Create frame index.
4813 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4814 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4815 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4816 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4817
4818 if (Flags.isByVal()) {
4819 // Copy relative to framepointer.
4820 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4821 if (!StackPtr.getNode())
4822 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4823 getPointerTy(DAG.getDataLayout()));
4824 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4825 StackPtr, Source);
4826
4827 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4828 ArgChain,
4829 Flags, DAG, dl));
4830 } else {
4831 // Store relative to framepointer.
4832 MemOpChains2.push_back(DAG.getStore(
4833 ArgChain, dl, Arg, FIN,
4834 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4835 }
4836 }
4837
4838 if (!MemOpChains2.empty())
4839 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4840
4841 // Store the return address to the appropriate stack slot.
4842 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4843 getPointerTy(DAG.getDataLayout()),
4844 RegInfo->getSlotSize(), FPDiff, dl);
4845 }
4846
4847 // Build a sequence of copy-to-reg nodes chained together with token chain
4848 // and flag operands which copy the outgoing args into registers.
4849 SDValue InFlag;
4850 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4851 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4852 RegsToPass[i].second, InFlag);
4853 InFlag = Chain.getValue(1);
4854 }
4855
4856 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4857 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4858 // In the 64-bit large code model, we have to make all calls
4859 // through a register, since the call instruction's 32-bit
4860 // pc-relative offset may not be large enough to hold the whole
4861 // address.
4862 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4863 Callee->getOpcode() == ISD::ExternalSymbol) {
4864 // Lower direct calls to global addresses and external symbols. Setting
4865 // ForCall to true here has the effect of removing WrapperRIP when possible
4866 // to allow direct calls to be selected without first materializing the
4867 // address into a register.
4868 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4869 } else if (Subtarget.isTarget64BitILP32() &&
4870 Callee.getValueType() == MVT::i32) {
4871 // Zero-extend the 32-bit Callee address into a 64-bit address according to the x32 ABI.
4872 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4873 }
4874
4875 // Returns a chain & a flag for retval copy to use.
4876 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4877 SmallVector<SDValue, 8> Ops;
4878
4879 if (!IsSibcall && isTailCall && !IsMustTail) {
4880 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InFlag, dl);
4881 InFlag = Chain.getValue(1);
4882 }
4883
4884 Ops.push_back(Chain);
4885 Ops.push_back(Callee);
4886
4887 if (isTailCall)
4888 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4889
4890 // Add argument registers to the end of the list so that they are known live
4891 // into the call.
4892 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4893 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4894 RegsToPass[i].second.getValueType()));
4895
4896 // Add a register mask operand representing the call-preserved registers.
4897 const uint32_t *Mask = [&]() {
4898 auto AdaptedCC = CallConv;
4899 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4900 // use X86_INTR calling convention because it has the same CSR mask
4901 // (same preserved registers).
4902 if (HasNCSR)
4903 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4904 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4905 // to use the CSR_NoRegs_RegMask.
4906 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4907 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4908 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4909 }();
4910 assert(Mask && "Missing call preserved mask for calling convention");
4911
4912 // If this is an invoke in a 32-bit function using a funclet-based
4913 // personality, assume the function clobbers all registers. If an exception
4914 // is thrown, the runtime will not restore CSRs.
4915 // FIXME: Model this more precisely so that we can register allocate across
4916 // the normal edge and spill and fill across the exceptional edge.
4917 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4918 const Function &CallerFn = MF.getFunction();
4919 EHPersonality Pers =
4920 CallerFn.hasPersonalityFn()
4921 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4922 : EHPersonality::Unknown;
4923 if (isFuncletEHPersonality(Pers))
4924 Mask = RegInfo->getNoPreservedMask();
4925 }
4926
4927 // Define a new register mask from the existing mask.
4928 uint32_t *RegMask = nullptr;
4929
4930 // In some calling conventions we need to remove the used physical registers
4931 // from the reg mask. Create a new RegMask for such calling conventions.
4932 // RegMask for calling conventions that disable only return registers (e.g.
4933 // preserve_most) will be modified later in LowerCallResult.
4934 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4935 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4936 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4937
4938 // Allocate a new Reg Mask and copy Mask.
4939 RegMask = MF.allocateRegMask();
4940 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4941 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4942
4943 // Make sure all sub registers of the argument registers are reset
4944 // in the RegMask.
4945 if (ShouldDisableArgRegs) {
4946 for (auto const &RegPair : RegsToPass)
4947 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4948 SubRegs.isValid(); ++SubRegs)
4949 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4950 }
4951
4952 // Create the RegMask Operand according to our updated mask.
4953 Ops.push_back(DAG.getRegisterMask(RegMask));
4954 } else {
4955 // Create the RegMask Operand according to the static mask.
4956 Ops.push_back(DAG.getRegisterMask(Mask));
4957 }
4958
4959 if (InFlag.getNode())
4960 Ops.push_back(InFlag);
4961
4962 if (isTailCall) {
4963 // We used to do:
4964 //// If this is the first return lowered for this function, add the regs
4965 //// to the liveout set for the function.
4966 // This isn't right, although it's probably harmless on x86; liveouts
4967 // should be computed from returns not tail calls. Consider a void
4968 // function making a tail call to a function returning int.
4969 MF.getFrameInfo().setHasTailCall();
4970 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4971
4972 if (IsCFICall)
4973 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4974
4975 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4976 return Ret;
4977 }
4978
4979 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4980 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4981 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4982 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4983 // expanded to the call, directly followed by a special marker sequence and
4984 // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
4985 assert(!isTailCall &&
4986 "tail calls cannot be marked with clang.arc.attachedcall");
4987 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4988
4989 // Add a target global address for the retainRV/claimRV runtime function
4990 // just before the call target.
4991 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4992 auto PtrVT = getPointerTy(DAG.getDataLayout());
4993 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4994 Ops.insert(Ops.begin() + 1, GA);
4995 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4996 } else {
4997 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4998 }
4999
5000 if (IsCFICall)
5001 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
5002
5003 InFlag = Chain.getValue(1);
5004 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5005 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5006
5007 // Save heapallocsite metadata.
5008 if (CLI.CB)
5009 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5010 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5011
5012 // Create the CALLSEQ_END node.
5013 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5014 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5015 DAG.getTarget().Options.GuaranteedTailCallOpt))
5016 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5017 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5018 // If this call passes a struct-return pointer, the callee
5019 // pops that struct pointer.
5020 NumBytesForCalleeToPop = 4;
5021
5022 // Returns a flag for retval copy to use.
5023 if (!IsSibcall) {
5024 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5025 InFlag, dl);
5026 InFlag = Chain.getValue(1);
5027 }
5028
5029 // Handle result values, copying them out of physregs into vregs that we
5030 // return.
5031 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
5032 InVals, RegMask);
5033}
5034
5035//===----------------------------------------------------------------------===//
5036// Fast Calling Convention (tail call) implementation
5037//===----------------------------------------------------------------------===//
5038
5039 // Like stdcall, the callee cleans up the arguments, except that ECX is
5040 // reserved for the tail-called function's address. Only 2 registers are
5041// free for argument passing (inreg). Tail call optimization is performed
5042// provided:
5043// * tailcallopt is enabled
5044// * caller/callee are fastcc
5045// On X86_64 architecture with GOT-style position independent code only local
5046// (within module) calls are supported at the moment.
5047 // To keep the stack aligned according to the platform ABI, the function
5048 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5049 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example.)
5050 // If the tail-called callee has more arguments than the caller, the caller
5051 // needs to make sure that there is room to move the RETADDR to. This is
5052 // achieved by reserving an area the size of the argument delta right after the
5053 // original RETADDR, but before the saved frame pointer or the spilled registers,
5054 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
5055// stack layout:
5056// arg1
5057// arg2
5058// RETADDR
5059// [ new RETADDR
5060// move area ]
5061// (possible EBP)
5062// ESI
5063// EDI
5064// local1 ..
5065
5066 /// Align the stack size to e.g. 16n + 12 bytes so that, together with the
5067 /// return address slot, it satisfies a 16-byte alignment requirement.
5068unsigned
5069X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5070 SelectionDAG &DAG) const {
5071 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5072 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5073 assert(StackSize % SlotSize == 0 &&
5074 "StackSize must be a multiple of SlotSize");
5075 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5076}
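[Editor's note: a worked restatement of the formula above, assuming a 4-byte slot and a 16-byte stack alignment as on 32-bit x86; the helper name is hypothetical.]

    // alignTo(StackSize + SlotSize, StackAlignment) - SlotSize, spelled out.
    constexpr unsigned alignedArgSize(unsigned StackSize, unsigned SlotSize,
                                      unsigned StackAlignment) {
      return (StackSize + SlotSize + StackAlignment - 1) / StackAlignment *
                 StackAlignment - SlotSize;
    }
    static_assert(alignedArgSize(12, 4, 16) == 12, "already of the form 16n + 12");
    static_assert(alignedArgSize(20, 4, 16) == 28, "rounded up to 16n + 12");
    // Adding back the 4-byte return-address slot yields a 16-byte-aligned total.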
5077
5078/// Return true if the given stack call argument is already available in the
5079/// same position (relatively) of the caller's incoming argument stack.
5080static
5081bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5082 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5083 const X86InstrInfo *TII, const CCValAssign &VA) {
5084 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5085
5086 for (;;) {
5087 // Look through nodes that don't alter the bits of the incoming value.
5088 unsigned Op = Arg.getOpcode();
5089 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5090 Arg = Arg.getOperand(0);
5091 continue;
5092 }
5093 if (Op == ISD::TRUNCATE) {
5094 const SDValue &TruncInput = Arg.getOperand(0);
5095 if (TruncInput.getOpcode() == ISD::AssertZext &&
5096 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5097 Arg.getValueType()) {
5098 Arg = TruncInput.getOperand(0);
5099 continue;
5100 }
5101 }
5102 break;
5103 }
5104
5105 int FI = INT_MAX;
5106 if (Arg.getOpcode() == ISD::CopyFromReg) {
5107 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5108 if (!VR.isVirtual())
5109 return false;
5110 MachineInstr *Def = MRI->getVRegDef(VR);
5111 if (!Def)
5112 return false;
5113 if (!Flags.isByVal()) {
5114 if (!TII->isLoadFromStackSlot(*Def, FI))
5115 return false;
5116 } else {
5117 unsigned Opcode = Def->getOpcode();
5118 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5119 Opcode == X86::LEA64_32r) &&
5120 Def->getOperand(1).isFI()) {
5121 FI = Def->getOperand(1).getIndex();
5122 Bytes = Flags.getByValSize();
5123 } else
5124 return false;
5125 }
5126 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5127 if (Flags.isByVal())
5128 // ByVal argument is passed in as a pointer but it's now being
5129 // dereferenced. e.g.
5130 // define @foo(%struct.X* %A) {
5131 // tail call @bar(%struct.X* byval %A)
5132 // }
5133 return false;
5134 SDValue Ptr = Ld->getBasePtr();
5135 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5136 if (!FINode)
5137 return false;
5138 FI = FINode->getIndex();
5139 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5140 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5141 FI = FINode->getIndex();
5142 Bytes = Flags.getByValSize();
5143 } else
5144 return false;
5145
5146 assert(FI != INT_MAX);
5147 if (!MFI.isFixedObjectIndex(FI))
5148 return false;
5149
5150 if (Offset != MFI.getObjectOffset(FI))
5151 return false;
5152
5153 // If this is not byval, check that the argument stack object is immutable.
5154 // inalloca and argument copy elision can create mutable argument stack
5155 // objects. Byval objects can be mutated, but a byval call intends to pass the
5156 // mutated memory.
5157 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5158 return false;
5159
5160 if (VA.getLocVT().getFixedSizeInBits() >
5161 Arg.getValueSizeInBits().getFixedValue()) {
5162 // If the argument location is wider than the argument type, check that any
5163 // extension flags match.
5164 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5165 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5166 return false;
5167 }
5168 }
5169
5170 return Bytes == MFI.getObjectSize(FI);
5171}
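[Editor's note: a hedged illustration of what MatchingStackOffset enables; the functions below are hypothetical.] When the caller forwards its own incoming stack argument unchanged, the outgoing slot coincides with the caller's fixed incoming object, so no copy is needed and a sibcall remains possible:

    // On SysV x86-64 the seventh integer argument is passed on the stack.
    int g(int, int, int, int, int, int, int x);
    int f(int a, int b, int c, int d, int e, int h, int x) {
      return g(a, b, c, d, e, h, x);   // 'x' already sits at the matching offset
    }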
5172
5173/// Check whether the call is eligible for tail call optimization. Targets
5174/// that want to do tail call optimization should implement this function.
5175bool X86TargetLowering::IsEligibleForTailCallOptimization(
5176 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5177 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5178 const SmallVectorImpl<SDValue> &OutVals,
5179 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5180 if (!mayTailCallThisCC(CalleeCC))
5181 return false;
5182
5183 // If -tailcallopt is specified, make fastcc functions tail-callable.
5184 MachineFunction &MF = DAG.getMachineFunction();
5185 const Function &CallerF = MF.getFunction();
5186
5187 // If the function return type is x86_fp80 and the callee return type is not,
5188 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5189 // perform a tailcall optimization here.
5190 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5191 return false;
5192
5193 CallingConv::ID CallerCC = CallerF.getCallingConv();
5194 bool CCMatch = CallerCC == CalleeCC;
5195 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5196 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5197 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5198 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5199
5200 // Win64 functions have extra shadow space for argument homing. Don't do the
5201 // sibcall if the caller and callee have mismatched expectations for this
5202 // space.
5203 if (IsCalleeWin64 != IsCallerWin64)
5204 return false;
5205
5206 if (IsGuaranteeTCO) {
5207 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5208 return true;
5209 return false;
5210 }
5211
5212 // Look for obvious safe cases to perform tail call optimization that do not
5213 // require ABI changes. This is what gcc calls sibcall.
5214
5215 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5216 // emit a special epilogue.
5217 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5218 if (RegInfo->hasStackRealignment(MF))
5219 return false;
5220
5221 // Also avoid sibcall optimization if we're an sret-returning function and the callee
5222 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5223 // insufficient.
5224 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5225 // For a compatible tail call the callee must return our sret pointer. So it
5226 // needs to be (a) an sret function itself and (b) we pass our sret as its
5227 // sret. Condition #b is harder to determine.
5228 return false;
5229 } else if (IsCalleePopSRet)
5230 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5231 // expect that.
5232 return false;
5233
5234 // Do not sibcall optimize vararg calls unless all arguments are passed via
5235 // registers.
5236 LLVMContext &C = *DAG.getContext();
5237 if (isVarArg && !Outs.empty()) {
5238 // Optimizing for varargs on Win64 is unlikely to be safe without
5239 // additional testing.
5240 if (IsCalleeWin64 || IsCallerWin64)
5241 return false;
5242
5243 SmallVector<CCValAssign, 16> ArgLocs;
5244 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5245
5246 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5247 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5248 if (!ArgLocs[i].isRegLoc())
5249 return false;
5250 }
5251
5252 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5253 // stack. Therefore, if it's not used by the call it is not safe to optimize
5254 // this into a sibcall.
5255 bool Unused = false;
5256 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5257 if (!Ins[i].Used) {
5258 Unused = true;
5259 break;
5260 }
5261 }
5262 if (Unused) {
5263 SmallVector<CCValAssign, 16> RVLocs;
5264 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5265 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5266 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5267 CCValAssign &VA = RVLocs[i];
5268 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5269 return false;
5270 }
5271 }
5272
5273 // Check that the call results are passed in the same way.
5274 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5275 RetCC_X86, RetCC_X86))
5276 return false;
5277 // The callee has to preserve all registers the caller needs to preserve.
5278 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5279 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5280 if (!CCMatch) {
5281 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5282 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5283 return false;
5284 }
5285
5286 unsigned StackArgsSize = 0;
5287
5288 // If the callee takes no arguments then go on to check the results of the
5289 // call.
5290 if (!Outs.empty()) {
5291 // Check if stack adjustment is needed. For now, do not do this if any
5292 // argument is passed on the stack.
5293 SmallVector<CCValAssign, 16> ArgLocs;
5294 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5295
5296 // Allocate shadow area for Win64
5297 if (IsCalleeWin64)
5298 CCInfo.AllocateStack(32, Align(8));
5299
5300 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5301 StackArgsSize = CCInfo.getNextStackOffset();
5302
5303 if (CCInfo.getNextStackOffset()) {
5304 // Check if the arguments are already laid out in the right way as
5305 // the caller's fixed stack objects.
5306 MachineFrameInfo &MFI = MF.getFrameInfo();
5307 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5308 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5309 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5310 CCValAssign &VA = ArgLocs[i];
5311 SDValue Arg = OutVals[i];
5312 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5313 if (VA.getLocInfo() == CCValAssign::Indirect)
5314 return false;
5315 if (!VA.isRegLoc()) {
5316 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5317 MFI, MRI, TII, VA))
5318 return false;
5319 }
5320 }
5321 }
5322
5323 bool PositionIndependent = isPositionIndependent();
5324 // If the tailcall address may be in a register, then make sure it's
5325 // possible to register allocate for it. In 32-bit, the call address can
5326 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5327 // callee-saved registers are restored. These happen to be the same
5328 // registers used to pass 'inreg' arguments so watch out for those.
5329 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5330 !isa<ExternalSymbolSDNode>(Callee)) ||
5331 PositionIndependent)) {
5332 unsigned NumInRegs = 0;
5333 // In PIC we need an extra register to formulate the address computation
5334 // for the callee.
5335 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5336
5337 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5338 CCValAssign &VA = ArgLocs[i];
5339 if (!VA.isRegLoc())
5340 continue;
5341 Register Reg = VA.getLocReg();
5342 switch (Reg) {
5343 default: break;
5344 case X86::EAX: case X86::EDX: case X86::ECX:
5345 if (++NumInRegs == MaxInRegs)
5346 return false;
5347 break;
5348 }
5349 }
5350 }
5351
5352 const MachineRegisterInfo &MRI = MF.getRegInfo();
5353 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5354 return false;
5355 }
5356
5357 bool CalleeWillPop =
5358 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5359 MF.getTarget().Options.GuaranteedTailCallOpt);
5360
5361 if (unsigned BytesToPop =
5362 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5363 // If we have bytes to pop, the callee must pop them.
5364 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5365 if (!CalleePopMatches)
5366 return false;
5367 } else if (CalleeWillPop && StackArgsSize > 0) {
5368 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5369 return false;
5370 }
5371
5372 return true;
5373}
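[Editor's note: a hedged example of the x86_fp80 restriction checked at the top of this function; the functions are hypothetical.]

    double callee();
    long double caller() { return callee(); }
    // On x86 Linux 'long double' is x86_fp80, so the result must be widened via
    // FP_EXTEND, which is not a nop here; the return above therefore cannot be
    // lowered as a tail call even though the calling conventions match.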
5374
5375FastISel *
5376X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5377 const TargetLibraryInfo *libInfo) const {
5378 return X86::createFastISel(funcInfo, libInfo);
5379}
5380
5381//===----------------------------------------------------------------------===//
5382// Other Lowering Hooks
5383//===----------------------------------------------------------------------===//
5384
5385bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5386 bool AssumeSingleUse) {
5387 if (!AssumeSingleUse && !Op.hasOneUse())
5388 return false;
5389 if (!ISD::isNormalLoad(Op.getNode()))
5390 return false;
5391
5392 // If this is an unaligned vector, make sure the target supports folding it.
5393 auto *Ld = cast<LoadSDNode>(Op.getNode());
5394 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5395 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5396 return false;
5397
5398 // TODO: If this is a non-temporal load and the target has an instruction
5399 // for it, it should not be folded. See "useNonTemporalLoad()".
5400
5401 return true;
5402}
5403
5404bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5405 const X86Subtarget &Subtarget,
5406 bool AssumeSingleUse) {
5407 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5408 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5409 return false;
5410
5411 // We cannot replace a wide volatile load with a broadcast-from-memory,
5412 // because that would narrow the load, which isn't legal for volatiles.
5413 auto *Ld = cast<LoadSDNode>(Op.getNode());
5414 return !Ld->isVolatile() ||
5415 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5416}
5417
5418bool X86::mayFoldIntoStore(SDValue Op) {
5419 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5420}
5421
5422bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5423 if (Op.hasOneUse()) {
5424 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5425 return (ISD::ZERO_EXTEND == Opcode);
5426 }
5427 return false;
5428}
5429
5430static bool isTargetShuffle(unsigned Opcode) {
5431 switch(Opcode) {
5432 default: return false;
5433 case X86ISD::BLENDI:
5434 case X86ISD::PSHUFB:
5435 case X86ISD::PSHUFD:
5436 case X86ISD::PSHUFHW:
5437 case X86ISD::PSHUFLW:
5438 case X86ISD::SHUFP:
5439 case X86ISD::INSERTPS:
5440 case X86ISD::EXTRQI:
5441 case X86ISD::INSERTQI:
5442 case X86ISD::VALIGN:
5443 case X86ISD::PALIGNR:
5444 case X86ISD::VSHLDQ:
5445 case X86ISD::VSRLDQ:
5446 case X86ISD::MOVLHPS:
5447 case X86ISD::MOVHLPS:
5448 case X86ISD::MOVSHDUP:
5449 case X86ISD::MOVSLDUP:
5450 case X86ISD::MOVDDUP:
5451 case X86ISD::MOVSS:
5452 case X86ISD::MOVSD:
5453 case X86ISD::MOVSH:
5454 case X86ISD::UNPCKL:
5455 case X86ISD::UNPCKH:
5456 case X86ISD::VBROADCAST:
5457 case X86ISD::VPERMILPI:
5458 case X86ISD::VPERMILPV:
5459 case X86ISD::VPERM2X128:
5460 case X86ISD::SHUF128:
5461 case X86ISD::VPERMIL2:
5462 case X86ISD::VPERMI:
5463 case X86ISD::VPPERM:
5464 case X86ISD::VPERMV:
5465 case X86ISD::VPERMV3:
5466 case X86ISD::VZEXT_MOVL:
5467 return true;
5468 }
5469}
5470
5471static bool isTargetShuffleVariableMask(unsigned Opcode) {
5472 switch (Opcode) {
5473 default: return false;
5474 // Target Shuffles.
5475 case X86ISD::PSHUFB:
5476 case X86ISD::VPERMILPV:
5477 case X86ISD::VPERMIL2:
5478 case X86ISD::VPPERM:
5479 case X86ISD::VPERMV:
5480 case X86ISD::VPERMV3:
5481 return true;
5482 // 'Faux' Target Shuffles.
5483 case ISD::OR:
5484 case ISD::AND:
5485 case X86ISD::ANDNP:
5486 return true;
5487 }
5488}
5489
5490SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5491 MachineFunction &MF = DAG.getMachineFunction();
5492 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5493 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5494 int ReturnAddrIndex = FuncInfo->getRAIndex();
5495
5496 if (ReturnAddrIndex == 0) {
5497 // Set up a frame object for the return address.
5498 unsigned SlotSize = RegInfo->getSlotSize();
5499 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5500 -(int64_t)SlotSize,
5501 false);
5502 FuncInfo->setRAIndex(ReturnAddrIndex);
5503 }
5504
5505 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5506}
5507
5508bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5509 bool hasSymbolicDisplacement) {
5510 // Offset should fit into 32 bit immediate field.
5511 if (!isInt<32>(Offset))
5512 return false;
5513
5514 // If we don't have a symbolic displacement - we don't have any extra
5515 // restrictions.
5516 if (!hasSymbolicDisplacement)
5517 return true;
5518
5519 // FIXME: Some tweaks might be needed for medium code model.
5520 if (M != CodeModel::Small && M != CodeModel::Kernel)
5521 return false;
5522
5523 // For the small code model we assume that the last object ends within 16MB of
5524 // the 31-bit address boundary. We may also accept pretty large negative
5525 // constants, knowing that all objects are in the positive half of the address space.
5526 if (M == CodeModel::Small && Offset < 16*1024*1024)
5527 return true;
5528
5529 // For the kernel code model we know that all objects reside in the negative
5530 // half of the 32-bit address space. We must not accept negative offsets, since
5531 // they may land just outside an object, but we may accept pretty large positive ones.
5532 if (M == CodeModel::Kernel && Offset >= 0)
5533 return true;
5534
5535 return false;
5536}
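[Editor's note: illustrative queries against the routine above, assuming a symbolic displacement is present.]

    // Small code model: offsets below the 16MB margin are accepted.
    //   X86::isOffsetSuitableForCodeModel(8 << 20, CodeModel::Small, true)   -> true
    // Kernel code model: negative offsets are rejected.
    //   X86::isOffsetSuitableForCodeModel(-64, CodeModel::Kernel, true)      -> false
    // Anything that does not fit a signed 32-bit immediate is rejected outright.
    //   X86::isOffsetSuitableForCodeModel(1LL << 33, CodeModel::Small, true) -> false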
5537
5538/// Determines whether the callee is required to pop its own arguments.
5539/// Callee pop is necessary to support tail calls.
5540bool X86::isCalleePop(CallingConv::ID CallingConv,
5541 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5542 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5543 // can guarantee TCO.
5544 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5545 return true;
5546
5547 switch (CallingConv) {
5548 default:
5549 return false;
5550 case CallingConv::X86_StdCall:
5551 case CallingConv::X86_FastCall:
5552 case CallingConv::X86_ThisCall:
5553 case CallingConv::X86_VectorCall:
5554 return !is64Bit;
5555 }
5556}
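[Editor's note: illustrative values for the query above.] On 32-bit x86 a stdcall callee pops its own arguments (ret $N), while the default C convention leaves cleanup to the caller:

    // X86::isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/false,
    //                  /*IsVarArg=*/false, /*GuaranteeTCO=*/false)  -> true
    // X86::isCalleePop(CallingConv::C, /*is64Bit=*/false,
    //                  /*IsVarArg=*/false, /*GuaranteeTCO=*/false)  -> false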
5557
5558 /// Return true if the condition is a signed comparison operation.
5559static bool isX86CCSigned(unsigned X86CC) {
5560 switch (X86CC) {
5561 default:
5562 llvm_unreachable("Invalid integer condition!");
5563 case X86::COND_E:
5564 case X86::COND_NE:
5565 case X86::COND_B:
5566 case X86::COND_A:
5567 case X86::COND_BE:
5568 case X86::COND_AE:
5569 return false;
5570 case X86::COND_G:
5571 case X86::COND_GE:
5572 case X86::COND_L:
5573 case X86::COND_LE:
5574 return true;
5575 }
5576}
5577
5578static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5579 switch (SetCCOpcode) {
5580 default: llvm_unreachable("Invalid integer condition!");
5581 case ISD::SETEQ: return X86::COND_E;
5582 case ISD::SETGT: return X86::COND_G;
5583 case ISD::SETGE: return X86::COND_GE;
5584 case ISD::SETLT: return X86::COND_L;
5585 case ISD::SETLE: return X86::COND_LE;
5586 case ISD::SETNE: return X86::COND_NE;
5587 case ISD::SETULT: return X86::COND_B;
5588 case ISD::SETUGT: return X86::COND_A;
5589 case ISD::SETULE: return X86::COND_BE;
5590 case ISD::SETUGE: return X86::COND_AE;
5591 }
5592}
5593
5594/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5595/// condition code, returning the condition code and the LHS/RHS of the
5596/// comparison to make.
5597static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5598 bool isFP, SDValue &LHS, SDValue &RHS,
5599 SelectionDAG &DAG) {
5600 if (!isFP) {
5601 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5602 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5603 // X > -1 -> X == 0, jump !sign.
5604 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5605 return X86::COND_NS;
5606 }
5607 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5608 // X < 0 -> X == 0, jump on sign.
5609 return X86::COND_S;
5610 }
5611 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5612 // X >= 0 -> X == 0, jump on !sign.
5613 return X86::COND_NS;
5614 }
5615 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5616 // X < 1 -> X <= 0
5617 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5618 return X86::COND_LE;
5619 }
5620 }
5621
5622 return TranslateIntegerX86CC(SetCCOpcode);
5623 }
5624
5625 // First determine if it is required or is profitable to flip the operands.
5626
5627 // If LHS is a foldable load, but RHS is not, flip the condition.
5628 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5629 !ISD::isNON_EXTLoad(RHS.getNode())) {
5630 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5631 std::swap(LHS, RHS);
5632 }
5633
5634 switch (SetCCOpcode) {
5635 default: break;
5636 case ISD::SETOLT:
5637 case ISD::SETOLE:
5638 case ISD::SETUGT:
5639 case ISD::SETUGE:
5640 std::swap(LHS, RHS);
5641 break;
5642 }
5643
5644 // On a floating point condition, the flags are set as follows:
5645 // ZF PF CF op
5646 // 0 | 0 | 0 | X > Y
5647 // 0 | 0 | 1 | X < Y
5648 // 1 | 0 | 0 | X == Y
5649 // 1 | 1 | 1 | unordered
5650 switch (SetCCOpcode) {
5651 default: llvm_unreachable("Condcode should be pre-legalized away");
5652 case ISD::SETUEQ:
5653 case ISD::SETEQ: return X86::COND_E;
5654 case ISD::SETOLT: // flipped
5655 case ISD::SETOGT:
5656 case ISD::SETGT: return X86::COND_A;
5657 case ISD::SETOLE: // flipped
5658 case ISD::SETOGE:
5659 case ISD::SETGE: return X86::COND_AE;
5660 case ISD::SETUGT: // flipped
5661 case ISD::SETULT:
5662 case ISD::SETLT: return X86::COND_B;
5663 case ISD::SETUGE: // flipped
5664 case ISD::SETULE:
5665 case ISD::SETLE: return X86::COND_BE;
5666 case ISD::SETONE:
5667 case ISD::SETNE: return X86::COND_NE;
5668 case ISD::SETUO: return X86::COND_P;
5669 case ISD::SETO: return X86::COND_NP;
5670 case ISD::SETOEQ:
5671 case ISD::SETUNE: return X86::COND_INVALID;
5672 }
5673}
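[Editor's note: a hedged sketch of the operand flip above; register names are placeholders and the assembly is illustrative only.] For an ordered "LHS < RHS" (ISD::SETOLT) the operands are swapped and COND_A is used, which matches the FP flag table:

    //   ucomiss %xmm_lhs, %xmm_rhs    # sets flags for RHS cmp LHS
    //   ja      taken                 # CF==0 && ZF==0  <=>  RHS > LHS  <=>  LHS < RHS
    // Unordered inputs set both CF and ZF, so the branch falls through, as SETOLT requires.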
5674
5675/// Is there a floating point cmov for the specific X86 condition code?
5676/// Current x86 isa includes the following FP cmov instructions:
5677 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5678static bool hasFPCMov(unsigned X86CC) {
5679 switch (X86CC) {
5680 default:
5681 return false;
5682 case X86::COND_B:
5683 case X86::COND_BE:
5684 case X86::COND_E:
5685 case X86::COND_P:
5686 case X86::COND_A:
5687 case X86::COND_AE:
5688 case X86::COND_NE:
5689 case X86::COND_NP:
5690 return true;
5691 }
5692}
5693
5694static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5695 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5696 VT.is512BitVector();
5697}
5698
5699bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5700 const CallInst &I,
5701 MachineFunction &MF,
5702 unsigned Intrinsic) const {
5703 Info.flags = MachineMemOperand::MONone;
5704 Info.offset = 0;
5705
5706 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5707 if (!IntrData) {
5708 switch (Intrinsic) {
5709 case Intrinsic::x86_aesenc128kl:
5710 case Intrinsic::x86_aesdec128kl:
5711 Info.opc = ISD::INTRINSIC_W_CHAIN;
5712 Info.ptrVal = I.getArgOperand(1);
5713 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5714 Info.align = Align(1);
5715 Info.flags |= MachineMemOperand::MOLoad;
5716 return true;
5717 case Intrinsic::x86_aesenc256kl:
5718 case Intrinsic::x86_aesdec256kl:
5719 Info.opc = ISD::INTRINSIC_W_CHAIN;
5720 Info.ptrVal = I.getArgOperand(1);
5721 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5722 Info.align = Align(1);
5723 Info.flags |= MachineMemOperand::MOLoad;
5724 return true;
5725 case Intrinsic::x86_aesencwide128kl:
5726 case Intrinsic::x86_aesdecwide128kl:
5727 Info.opc = ISD::INTRINSIC_W_CHAIN;
5728 Info.ptrVal = I.getArgOperand(0);
5729 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5730 Info.align = Align(1);
5731 Info.flags |= MachineMemOperand::MOLoad;
5732 return true;
5733 case Intrinsic::x86_aesencwide256kl:
5734 case Intrinsic::x86_aesdecwide256kl:
5735 Info.opc = ISD::INTRINSIC_W_CHAIN;
5736 Info.ptrVal = I.getArgOperand(0);
5737 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5738 Info.align = Align(1);
5739 Info.flags |= MachineMemOperand::MOLoad;
5740 return true;
5741 case Intrinsic::x86_cmpccxadd32:
5742 case Intrinsic::x86_cmpccxadd64:
5743 case Intrinsic::x86_atomic_bts:
5744 case Intrinsic::x86_atomic_btc:
5745 case Intrinsic::x86_atomic_btr: {
5746 Info.opc = ISD::INTRINSIC_W_CHAIN;
5747 Info.ptrVal = I.getArgOperand(0);
5748 unsigned Size = I.getType()->getScalarSizeInBits();
5749 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5750 Info.align = Align(Size);
5751 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5752 MachineMemOperand::MOVolatile;
5753 return true;
5754 }
5755 case Intrinsic::x86_atomic_bts_rm:
5756 case Intrinsic::x86_atomic_btc_rm:
5757 case Intrinsic::x86_atomic_btr_rm: {
5758 Info.opc = ISD::INTRINSIC_W_CHAIN;
5759 Info.ptrVal = I.getArgOperand(0);
5760 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5761 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5762 Info.align = Align(Size);
5763 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5764 MachineMemOperand::MOVolatile;
5765 return true;
5766 }
5767 case Intrinsic::x86_aadd32:
5768 case Intrinsic::x86_aadd64:
5769 case Intrinsic::x86_aand32:
5770 case Intrinsic::x86_aand64:
5771 case Intrinsic::x86_aor32:
5772 case Intrinsic::x86_aor64:
5773 case Intrinsic::x86_axor32:
5774 case Intrinsic::x86_axor64:
5775 case Intrinsic::x86_atomic_add_cc:
5776 case Intrinsic::x86_atomic_sub_cc:
5777 case Intrinsic::x86_atomic_or_cc:
5778 case Intrinsic::x86_atomic_and_cc:
5779 case Intrinsic::x86_atomic_xor_cc: {
5780 Info.opc = ISD::INTRINSIC_W_CHAIN;
5781 Info.ptrVal = I.getArgOperand(0);
5782 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5783 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5784 Info.align = Align(Size);
5785 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5786 MachineMemOperand::MOVolatile;
5787 return true;
5788 }
5789 }
5790 return false;
5791 }
5792
5793 switch (IntrData->Type) {
5794 case TRUNCATE_TO_MEM_VI8:
5795 case TRUNCATE_TO_MEM_VI16:
5796 case TRUNCATE_TO_MEM_VI32: {
5797 Info.opc = ISD::INTRINSIC_VOID;
5798 Info.ptrVal = I.getArgOperand(0);
5799 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5800 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5801 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5802 ScalarVT = MVT::i8;
5803 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5804 ScalarVT = MVT::i16;
5805 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5806 ScalarVT = MVT::i32;
5807
5808 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5809 Info.align = Align(1);
5810 Info.flags |= MachineMemOperand::MOStore;
5811 break;
5812 }
5813 case GATHER:
5814 case GATHER_AVX2: {
5815 Info.opc = ISD::INTRINSIC_W_CHAIN;
5816 Info.ptrVal = nullptr;
5817 MVT DataVT = MVT::getVT(I.getType());
5818 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5819 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5820 IndexVT.getVectorNumElements());
5821 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5822 Info.align = Align(1);
5823 Info.flags |= MachineMemOperand::MOLoad;
5824 break;
5825 }
5826 case SCATTER: {
5827 Info.opc = ISD::INTRINSIC_VOID;
5828 Info.ptrVal = nullptr;
5829 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5830 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5831 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5832 IndexVT.getVectorNumElements());
5833 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5834 Info.align = Align(1);
5835 Info.flags |= MachineMemOperand::MOStore;
5836 break;
5837 }
5838 default:
5839 return false;
5840 }
5841
5842 return true;
5843}
5844
5845/// Returns true if the target can instruction select the
5846/// specified FP immediate natively. If false, the legalizer will
5847/// materialize the FP immediate as a load from a constant pool.
5848bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5849 bool ForCodeSize) const {
5850 for (const APFloat &FPImm : LegalFPImmediates)
5851 if (Imm.bitwiseIsEqual(FPImm))
5852 return true;
5853 return false;
5854}
5855
5856bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5857 ISD::LoadExtType ExtTy,
5858 EVT NewVT) const {
5859 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5860
5861 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5862 // relocations must target a movq or addq instruction: don't let the load shrink.
5863 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5864 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5865 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5866 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5867
5868 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5869 // those uses are extracted directly into a store, then the extract + store
5870 // can be store-folded. Therefore, it's probably not worth splitting the load.
5871 EVT VT = Load->getValueType(0);
5872 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5873 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5874 // Skip uses of the chain value. Result 0 of the node is the load value.
5875 if (UI.getUse().getResNo() != 0)
5876 continue;
5877
5878 // If this use is not an extract + store, it's probably worth splitting.
5879 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5880 UI->use_begin()->getOpcode() != ISD::STORE)
5881 return true;
5882 }
5883 // All non-chain uses are extract + store.
5884 return false;
5885 }
5886
5887 return true;
5888}
5889
5890/// Returns true if it is beneficial to convert a load of a constant
5891/// to just the constant itself.
5892bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5893 Type *Ty) const {
5894 assert(Ty->isIntegerTy());
5895
5896 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5897 if (BitSize == 0 || BitSize > 64)
5898 return false;
5899 return true;
5900}
5901
5902bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5903 // If we are using XMM registers in the ABI and the condition of the select is
5904 // a floating-point compare and we have blendv or conditional move, then it is
5905 // cheaper to select instead of doing a cross-register move and creating a
5906 // load that depends on the compare result.
5907 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5908 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5909}
5910
5911bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5912 // TODO: It might be a win to ease or lift this restriction, but the generic
5913 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5914 if (VT.isVector() && Subtarget.hasAVX512())
5915 return false;
5916
5917 return true;
5918}
5919
5920bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5921 SDValue C) const {
5922 // TODO: We handle scalars using custom code, but generic combining could make
5923 // that unnecessary.
5924 APInt MulC;
5925 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5926 return false;
5927
5928 // Find the type this will be legalized to. Otherwise we might prematurely
5929 // convert this to shl+add/sub and then still have to type legalize those ops.
5930 // Another choice would be to defer the decision for illegal types until
5931 // after type legalization. But constant splat vectors of i64 can't make it
5932 // through type legalization on 32-bit targets so we would need to special
5933 // case vXi64.
5934 while (getTypeAction(Context, VT) != TypeLegal)
5935 VT = getTypeToTransformTo(Context, VT);
5936
5937 // If vector multiply is legal, assume that's faster than shl + add/sub.
5938 // Multiply is a complex op with higher latency and lower throughput in
5939 // most implementations; sub-vXi32 vector multiplies are always fast,
5940 // vXi32 must not have a slow PMULLD implementation, and anything larger (vXi64)
5941 // is always going to be slow.
5942 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5943 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5944 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5945 return false;
5946
5947 // shl+add, shl+sub, shl+add+neg
5948 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5949 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5950}
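[Editor's note: illustrative splat constants that satisfy the power-of-two tests above.]

    // x * 9   ->  (x << 3) + x       (MulC - 1 is a power of two: shl + add)
    // x * 7   ->  (x << 3) - x       (MulC + 1 is a power of two: shl + sub)
    // x * -7  ->  x - (x << 3)       (1 - MulC is a power of two)
    // x * -9  ->  -((x << 3) + x)    (-(MulC + 1) is a power of two: shl + add + neg)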
5951
5952bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5953 unsigned Index) const {
5954 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5955 return false;
5956
5957 // Mask vectors support all subregister combinations and operations that
5958 // extract half of vector.
5959 if (ResVT.getVectorElementType() == MVT::i1)
5960 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5961 (Index == ResVT.getVectorNumElements()));
5962
5963 return (Index % ResVT.getVectorNumElements()) == 0;
5964}
5965
5966bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5967 unsigned Opc = VecOp.getOpcode();
5968
5969 // Assume target opcodes can't be scalarized.
5970 // TODO - do we have any exceptions?
5971 if (Opc >= ISD::BUILTIN_OP_END)
5972 return false;
5973
5974 // If the vector op is not supported, try to convert to scalar.
5975 EVT VecVT = VecOp.getValueType();
5976 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5977 return true;
5978
5979 // If the vector op is supported, but the scalar op is not, the transform may
5980 // not be worthwhile.
5981 EVT ScalarVT = VecVT.getScalarType();
5982 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5983}
5984
5985bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5986 bool) const {
5987 // TODO: Allow vectors?
5988 if (VT.isVector())
5989 return false;
5990 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5991}
5992
5993bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
5994 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
5995 return Subtarget.hasBMI() ||
5996 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
5997}
5998
5999bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
6000 // Speculate ctlz only if we can directly use LZCNT.
6001 return Subtarget.hasLZCNT();
6002}
6003
6004bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6005 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6006 // expensive than a straight movsd. On the other hand, it's important to
6007 // shrink long double fp constant since fldt is very slow.
6008 return !Subtarget.hasSSE2() || VT == MVT::f80;
6009}
6010
6011bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6012 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6013 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6014}
6015
6016bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6017 const SelectionDAG &DAG,
6018 const MachineMemOperand &MMO) const {
6019 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6020 BitcastVT.getVectorElementType() == MVT::i1)
6021 return false;
6022
6023 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6024 return false;
6025
6026 // If both types are legal vectors, it's always ok to convert them.
6027 if (LoadVT.isVector() && BitcastVT.isVector() &&
6028 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6029 return true;
6030
6031 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6032}
6033
6034bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6035 const MachineFunction &MF) const {
6036 // Do not merge to float value size (128 bytes) if no implicit
6037 // float attribute is set.
6038 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6039
6040 if (NoFloat) {
6041 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6042 return (MemVT.getSizeInBits() <= MaxIntSize);
6043 }
6044 // Make sure we don't merge greater than our preferred vector
6045 // width.
6046 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6047 return false;
6048
6049 return true;
6050}
6051
6052bool X86TargetLowering::isCtlzFast() const {
6053 return Subtarget.hasFastLZCNT();
6054}
6055
6056bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6057 const Instruction &AndI) const {
6058 return true;
6059}
6060
6061bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6062 EVT VT = Y.getValueType();
6063
6064 if (VT.isVector())
6065 return false;
6066
6067 if (!Subtarget.hasBMI())
6068 return false;
6069
6070 // There are only 32-bit and 64-bit forms for 'andn'.
6071 if (VT != MVT::i32 && VT != MVT::i64)
6072 return false;
6073
6074 return !isa<ConstantSDNode>(Y);
6075}
6076
6077bool X86TargetLowering::hasAndNot(SDValue Y) const {
6078 EVT VT = Y.getValueType();
6079
6080 if (!VT.isVector())
6081 return hasAndNotCompare(Y);
6082
6083 // Vector.
6084
6085 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6086 return false;
6087
6088 if (VT == MVT::v4i32)
6089 return true;
6090
6091 return Subtarget.hasSSE2();
6092}
6093
6094bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6095 return X.getValueType().isScalarInteger(); // 'bt'
6096}
6097
6098bool X86TargetLowering::
6099 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6100 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6101 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6102 SelectionDAG &DAG) const {
6103 // Does baseline recommend not to perform the fold by default?
6104 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6105 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6106 return false;
6107 // For scalars this transform is always beneficial.
6108 if (X.getValueType().isScalarInteger())
6109 return true;
6110 // If all the shift amounts are identical, then transform is beneficial even
6111 // with rudimentary SSE2 shifts.
6112 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6113 return true;
6114 // If we have AVX2 with its powerful shift operations, then it's also good.
6115 if (Subtarget.hasAVX2())
6116 return true;
6117 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6118 return NewShiftOpcode == ISD::SHL;
6119}
6120
6121bool X86TargetLowering::preferScalarizeSplat(unsigned Opc) const {
6122 return Opc != ISD::FP_EXTEND;
6123}
6124
6125bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6126 const SDNode *N, CombineLevel Level) const {
6127 assert(((N->getOpcode() == ISD::SHL &&
6128 N->getOperand(0).getOpcode() == ISD::SRL) ||
6129 (N->getOpcode() == ISD::SRL &&
6130 N->getOperand(0).getOpcode() == ISD::SHL)) &&
6131 "Expected shift-shift mask");
6132 // TODO: Should we always create i64 masks? Or only folded immediates?
6133 EVT VT = N->getValueType(0);
6134 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6135 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6136 // Only fold if the shift values are equal - so it folds to AND.
6137 // TODO - we should fold if either is a non-uniform vector but we don't do
6138 // the fold for non-splats yet.
6139 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6140 }
6141 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6142}
6143
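// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the
// identity the fold above relies on when both shift amounts are equal: an
// srl/shl pair that clears the low C bits is the same as a single AND mask.
// Plain uint32_t stands in for the DAG values; C < 32 is assumed.
#include <cassert>
#include <cstdint>
static uint32_t foldShiftPairToMaskSketch(uint32_t X, unsigned C) {
  uint32_t ShiftPair = (X >> C) << C;       // srl C, then shl C
  uint32_t Masked = X & ~((1u << C) - 1u);  // single AND with a constant mask
  assert(ShiftPair == Masked && "shift pair and AND mask must agree");
  return Masked;
}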
6144bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6145 EVT VT = Y.getValueType();
6146
6147 // For vectors, we don't have a preference, but we probably want a mask.
6148 if (VT.isVector())
6149 return false;
6150
6151 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6152 if (VT == MVT::i64 && !Subtarget.is64Bit())
6153 return false;
6154
6155 return true;
6156}
6157
6158TargetLowering::ShiftLegalizationStrategy
6159X86TargetLowering::preferredShiftLegalizationStrategy(
6160 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6161 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6162 !Subtarget.isOSWindows())
6163 return ShiftLegalizationStrategy::LowerToLibcall;
6164 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6165 ExpansionFactor);
6166}
6167
6168bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6169 // Any legal vector type can be splatted more efficiently than
6170 // loading/spilling from memory.
6171 return isTypeLegal(VT);
6172}
6173
6174MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6175 MVT VT = MVT::getIntegerVT(NumBits);
6176 if (isTypeLegal(VT))
6177 return VT;
6178
6179 // PMOVMSKB can handle this.
6180 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6181 return MVT::v16i8;
6182
6183 // VPMOVMSKB can handle this.
6184 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6185 return MVT::v32i8;
6186
6187 // TODO: Allow 64-bit type for 32-bit target.
6188 // TODO: 512-bit types should be allowed, but make sure that those
6189 // cases are handled in combineVectorSizedSetCCEquality().
6190
6191 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6192}
6193
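// Illustrative standalone sketch (not from X86ISelLowering.cpp) of why
// v16i8 is reported as a fast 128-bit equality type above: one PCMPEQB plus
// one PMOVMSKB answers a whole 16-byte compare. Assumes SSE2 and the
// <emmintrin.h> intrinsics.
#include <cstring>
#include <emmintrin.h>
static bool equal16BytesSketch(const void *A, const void *B) {
  __m128i VA, VB;
  std::memcpy(&VA, A, 16);                 // unaligned-safe 16-byte loads
  std::memcpy(&VB, B, 16);
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);     // per-byte equality (PCMPEQB)
  return _mm_movemask_epi8(Eq) == 0xFFFF;  // all 16 byte lanes matched
}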
6194/// Val is the undef sentinel value or equal to the specified value.
6195static bool isUndefOrEqual(int Val, int CmpVal) {
6196 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6197}
6198
6199/// Return true if every element in Mask is the undef sentinel value or equal to
6200/// the specified value.
6201static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6202 return llvm::all_of(Mask, [CmpVal](int M) {
6203 return (M == SM_SentinelUndef) || (M == CmpVal);
6204 });
6205}
6206
6207/// Val is either the undef or zero sentinel value.
6208static bool isUndefOrZero(int Val) {
6209 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6210}
6211
6212/// Return true if every element in Mask, beginning from position Pos and ending
6213/// in Pos+Size, is the undef sentinel value.
6214static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6215 return llvm::all_of(Mask.slice(Pos, Size),
6216 [](int M) { return M == SM_SentinelUndef; });
6217}
6218
6219/// Return true if the mask creates a vector whose lower half is undefined.
6220static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6221 unsigned NumElts = Mask.size();
6222 return isUndefInRange(Mask, 0, NumElts / 2);
6223}
6224
6225/// Return true if the mask creates a vector whose upper half is undefined.
6226static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6227 unsigned NumElts = Mask.size();
6228 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6229}
6230
6231/// Return true if Val falls within the specified range [Low, Hi).
6232static bool isInRange(int Val, int Low, int Hi) {
6233 return (Val >= Low && Val < Hi);
6234}
6235
6236/// Return true if the value of any element in Mask falls within the specified
6237/// range [Low, Hi).
6238static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6239 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6240}
6241
6242/// Return true if the value of any element in Mask is the zero sentinel value.
6243static bool isAnyZero(ArrayRef<int> Mask) {
6244 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6245}
6246
6247/// Return true if the value of any element in Mask is the zero or undef
6248/// sentinel values.
6249static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6250 return llvm::any_of(Mask, [](int M) {
6251 return M == SM_SentinelZero || M == SM_SentinelUndef;
6252 });
6253}
6254
6255/// Return true if Val is undef or if its value falls within the
6256/// specified range [Low, Hi).
6257static bool isUndefOrInRange(int Val, int Low, int Hi) {
6258 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6259}
6260
6261/// Return true if every element in Mask is undef or if its value
6262/// falls within the specified range [Low, Hi).
6263static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6264 return llvm::all_of(
6265 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6266}
6267
6268/// Return true if Val is undef, zero or if its value falls within the
6269/// specified range [Low, Hi).
6270static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6271 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6272}
6273
6274/// Return true if every element in Mask is undef, zero or if its value
6275/// falls within the specified range [Low, Hi).
6276static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6277 return llvm::all_of(
6278 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6279}
6280
6281/// Return true if every element in Mask, beginning
6282/// from position Pos and ending in Pos + Size, falls within the specified
6283/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6284static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6285 unsigned Size, int Low, int Step = 1) {
6286 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6287 if (!isUndefOrEqual(Mask[i], Low))
6288 return false;
6289 return true;
6290}
6291
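// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the
// sequential-or-undef check above on a plain mask, with -1 standing in for
// SM_SentinelUndef: {4, -1, 6, 7} is sequential from Low = 4 with Step = 1.
#include <vector>
static bool isSequentialOrUndefSketch(const std::vector<int> &Mask, int Low,
                                      int Step = 1) {
  for (int M : Mask) {
    if (M != -1 && M != Low)  // each element is undef or the expected value
      return false;
    Low += Step;
  }
  return true;
}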
6292/// Return true if every element in Mask, beginning
6293/// from position Pos and ending in Pos+Size, falls within the specified
6294/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
6295static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6296 unsigned Size, int Low,
6297 int Step = 1) {
6298 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6299 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6300 return false;
6301 return true;
6302}
6303
6304/// Return true if every element in Mask, beginning
6305/// from position Pos and ending in Pos+Size, is undef or is zero.
6306static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6307 unsigned Size) {
6308 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6309}
6310
6311/// Helper function to test whether a shuffle mask could be
6312/// simplified by widening the elements being shuffled.
6313///
6314/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6315/// leaves it in an unspecified state.
6316///
6317/// NOTE: This must handle normal vector shuffle masks and *target* vector
6318/// shuffle masks. The latter have the special property of a '-2' representing
6319/// a zeroed lane of a vector.
6320static bool canWidenShuffleElements(ArrayRef<int> Mask,
6321 SmallVectorImpl<int> &WidenedMask) {
6322 WidenedMask.assign(Mask.size() / 2, 0);
6323 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6324 int M0 = Mask[i];
6325 int M1 = Mask[i + 1];
6326
6327 // If both elements are undef, it's trivial.
6328 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6329 WidenedMask[i / 2] = SM_SentinelUndef;
6330 continue;
6331 }
6332
6333 // Check for an undef mask and a mask value properly aligned to fit with
6334 // a pair of values. If we find such a case, use the non-undef mask's value.
6335 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6336 WidenedMask[i / 2] = M1 / 2;
6337 continue;
6338 }
6339 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6340 WidenedMask[i / 2] = M0 / 2;
6341 continue;
6342 }
6343
6344 // When zeroing, we need to spread the zeroing across both lanes to widen.
6345 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6346 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6347 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6348 WidenedMask[i / 2] = SM_SentinelZero;
6349 continue;
6350 }
6351 return false;
6352 }
6353
6354 // Finally check if the two mask values are adjacent and aligned with
6355 // a pair.
6356 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6357 WidenedMask[i / 2] = M0 / 2;
6358 continue;
6359 }
6360
6361 // Otherwise we can't safely widen the elements used in this shuffle.
6362 return false;
6363 }
6364 assert(WidenedMask.size() == Mask.size() / 2 &&
6365 "Incorrect size of mask after widening the elements!");
6366
6367 return true;
6368}
6369
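// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the
// widening rules above, with -1 for SM_SentinelUndef and -2 for
// SM_SentinelZero: the v4 mask {0, 1, -1, 3} widens to the v2 mask {0, 1},
// while {0, 2, 4, 6} fails because 0/2 is not an adjacent even/odd pair.
#include <vector>
static bool widenMaskSketch(const std::vector<int> &Mask,
                            std::vector<int> &Widened) {
  Widened.clear();
  for (unsigned I = 0, E = unsigned(Mask.size()); I + 1 < E; I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 == -1 && M1 == -1) { Widened.push_back(-1); continue; }  // both undef
    if (M0 == -1 && M1 >= 0 && (M1 % 2) == 1) { Widened.push_back(M1 / 2); continue; }
    if (M1 == -1 && M0 >= 0 && (M0 % 2) == 0) { Widened.push_back(M0 / 2); continue; }
    if (M0 < 0 && M1 < 0) { Widened.push_back(-2); continue; }      // zero/undef pair
    if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) { Widened.push_back(M0 / 2); continue; }
    return false;                                                   // can't widen this pair
  }
  return true;
}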
6370static bool canWidenShuffleElements(ArrayRef<int> Mask,
6371 const APInt &Zeroable,
6372 bool V2IsZero,
6373 SmallVectorImpl<int> &WidenedMask) {
6374 // Create an alternative mask with info about zeroable elements.
6375 // Here we do not set undef elements as zeroable.
6376 SmallVector<int, 64> ZeroableMask(Mask);
6377 if (V2IsZero) {
6378 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6379 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6380 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6381 ZeroableMask[i] = SM_SentinelZero;
6382 }
6383 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6384}
6385
6386static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6387 SmallVector<int, 32> WidenedMask;
6388 return canWidenShuffleElements(Mask, WidenedMask);
6389}
6390
6391// Attempt to narrow/widen shuffle mask until it matches the target number of
6392// elements.
6393static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6394 SmallVectorImpl<int> &ScaledMask) {
6395 unsigned NumSrcElts = Mask.size();
6396 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6397 "Illegal shuffle scale factor");
6398
6399 // Narrowing is guaranteed to work.
6400 if (NumDstElts >= NumSrcElts) {
6401 int Scale = NumDstElts / NumSrcElts;
6402 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6403 return true;
6404 }
6405
6406 // We have to repeat the widening until we reach the target size, but we can
6407 // split out the first widening as it sets up ScaledMask for us.
6408 if (canWidenShuffleElements(Mask, ScaledMask)) {
6409 while (ScaledMask.size() > NumDstElts) {
6410 SmallVector<int, 16> WidenedMask;
6411 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6412 return false;
6413 ScaledMask = std::move(WidenedMask);
6414 }
6415 return true;
6416 }
6417
6418 return false;
6419}
6420
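// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the
// narrowing direction above: scaling the v2 mask {1, 0} to 4 elements gives
// {2, 3, 0, 1}, i.e. every wide lane expands into Scale consecutive narrow
// lanes, and negative sentinels (-1 undef, -2 zero) are simply repeated.
#include <vector>
static std::vector<int> narrowMaskSketch(unsigned Scale,
                                         const std::vector<int> &Mask) {
  std::vector<int> Narrowed;
  for (int M : Mask)
    for (unsigned I = 0; I != Scale; ++I)
      Narrowed.push_back(M < 0 ? M : int(M * Scale + I));
  return Narrowed;
}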
6421/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6422bool X86::isZeroNode(SDValue Elt) {
6423 return isNullConstant(Elt) || isNullFPConstant(Elt);
6424}
6425
6426// Build a vector of constants.
6427// Use an UNDEF node if MaskElt == -1.
6428// Split 64-bit constants in 32-bit mode.
6429static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6430 const SDLoc &dl, bool IsMask = false) {
6431
6432 SmallVector<SDValue, 32> Ops;
6433 bool Split = false;
6434
6435 MVT ConstVecVT = VT;
6436 unsigned NumElts = VT.getVectorNumElements();
6437 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6438 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6439 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6440 Split = true;
6441 }
6442
6443 MVT EltVT = ConstVecVT.getVectorElementType();
6444 for (unsigned i = 0; i < NumElts; ++i) {
6445 bool IsUndef = Values[i] < 0 && IsMask;
6446 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6447 DAG.getConstant(Values[i], dl, EltVT);
6448 Ops.push_back(OpNode);
6449 if (Split)
6450 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6451 DAG.getConstant(0, dl, EltVT));
6452 }
6453 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6454 if (Split)
6455 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6456 return ConstsNode;
6457}
6458
6459static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6460 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6461 assert(Bits.size() == Undefs.getBitWidth() &&
6462 "Unequal constant and undef arrays");
6463 SmallVector<SDValue, 32> Ops;
6464 bool Split = false;
6465
6466 MVT ConstVecVT = VT;
6467 unsigned NumElts = VT.getVectorNumElements();
6468 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6469 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6470 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6471 Split = true;
6472 }
6473
6474 MVT EltVT = ConstVecVT.getVectorElementType();
6475 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6476 if (Undefs[i]) {
6477 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6478 continue;
6479 }
6480 const APInt &V = Bits[i];
6481 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6482 if (Split) {
6483 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6484 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6485 } else if (EltVT == MVT::f32) {
6486 APFloat FV(APFloat::IEEEsingle(), V);
6487 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6488 } else if (EltVT == MVT::f64) {
6489 APFloat FV(APFloat::IEEEdouble(), V);
6490 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6491 } else {
6492 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6493 }
6494 }
6495
6496 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6497 return DAG.getBitcast(VT, ConstsNode);
6498}
6499
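// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the
// 64-bit split performed above when i64 is not legal: each 64-bit constant
// becomes a little-endian (lo32, hi32) pair of 32-bit elements, and the
// wider vector is then bitcast back to the requested type.
#include <cstdint>
#include <vector>
static std::vector<uint32_t>
splitI64ConstsSketch(const std::vector<uint64_t> &Vals) {
  std::vector<uint32_t> Out;
  for (uint64_t V : Vals) {
    Out.push_back(uint32_t(V));        // low 32 bits first
    Out.push_back(uint32_t(V >> 32));  // then the high 32 bits
  }
  return Out;
}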
6500/// Returns a vector of specified type with all zero elements.
6501static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6502 SelectionDAG &DAG, const SDLoc &dl) {
6503 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6504 VT.getVectorElementType() == MVT::i1) &&
6505 "Unexpected vector type");
6506
6507 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6508 // type. This ensures they get CSE'd. But if the integer type is not
6509 // available, use a floating-point +0.0 instead.
6510 SDValue Vec;
6511 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6512 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6513 } else if (VT.isFloatingPoint()) {
6514 Vec = DAG.getConstantFP(+0.0, dl, VT);
6515 } else if (VT.getVectorElementType() == MVT::i1) {
6516 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6517 "Unexpected vector type");
6518 Vec = DAG.getConstant(0, dl, VT);
6519 } else {
6520 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6521 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6522 }
6523 return DAG.getBitcast(VT, Vec);
6524}
6525
6526// Helper to determine if the ops are all extracted subvectors that come from a
6527// single source. If we allow commute they don't have to be in order (Lo/Hi).
6528static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6529 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6530 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6531 LHS.getValueType() != RHS.getValueType() ||
6532 LHS.getOperand(0) != RHS.getOperand(0))
6533 return SDValue();
6534
6535 SDValue Src = LHS.getOperand(0);
6536 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6537 return SDValue();
6538
6539 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6540 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6541 RHS.getConstantOperandAPInt(1) == NumElts) ||
6542 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6543 LHS.getConstantOperandAPInt(1) == NumElts))
6544 return Src;
6545
6546 return SDValue();
6547}
6548
6549static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6550 const SDLoc &dl, unsigned vectorWidth) {
6551 EVT VT = Vec.getValueType();
6552 EVT ElVT = VT.getVectorElementType();
6553 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6554 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6555 VT.getVectorNumElements() / Factor);
6556
6557 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6558 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6559 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6560
6561 // This is the index of the first element of the vectorWidth-bit chunk
6562 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6563 IdxVal &= ~(ElemsPerChunk - 1);
6564
6565 // If the input is a buildvector just emit a smaller one.
6566 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6567 return DAG.getBuildVector(ResultVT, dl,
6568 Vec->ops().slice(IdxVal, ElemsPerChunk));
6569
6570 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6571 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6572}
6573
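// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the index
// rounding above: ElemsPerChunk is a power of two, so clearing its low bits
// aligns IdxVal down to a chunk boundary (element 5 of a v8i32, with 4
// elements per 128-bit chunk, maps to the chunk starting at element 4).
static unsigned alignToChunkSketch(unsigned IdxVal, unsigned ElemsPerChunk) {
  return IdxVal & ~(ElemsPerChunk - 1);  // assumes a power-of-two chunk size
}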
6574/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6575/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6576/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6577/// instructions or a simple subregister reference. Idx is an index in the
6578/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6579/// lowering EXTRACT_VECTOR_ELT operations easier.
6580static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6581 SelectionDAG &DAG, const SDLoc &dl) {
6582 assert((Vec.getValueType().is256BitVector() ||
6583 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6584 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6585}
6586
6587/// Generate a DAG to grab 256-bits from a 512-bit vector.
6588static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6589 SelectionDAG &DAG, const SDLoc &dl) {
6590 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6591 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6592}
6593
6594static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6595 SelectionDAG &DAG, const SDLoc &dl,
6596 unsigned vectorWidth) {
6597 assert((vectorWidth == 128 || vectorWidth == 256) &&
6598 "Unsupported vector width");
6599 // Inserting an UNDEF subvector just returns Result.
6600 if (Vec.isUndef())
6601 return Result;
6602 EVT VT = Vec.getValueType();
6603 EVT ElVT = VT.getVectorElementType();
6604 EVT ResultVT = Result.getValueType();
6605
6606 // Insert the relevant vectorWidth bits.
6607 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6608 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6609
6610 // This is the index of the first element of the vectorWidth-bit chunk
6611 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6612 IdxVal &= ~(ElemsPerChunk - 1);
6613
6614 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6615 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6616}
6617
6618/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6619/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6620/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6621/// simple superregister reference. Idx is an index in the 128 bits
6622/// we want. It need not be aligned to a 128-bit boundary. That makes
6623/// lowering INSERT_VECTOR_ELT operations easier.
6624static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6625 SelectionDAG &DAG, const SDLoc &dl) {
6626 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6627 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6628}
6629
6630/// Widen a vector to a larger size with the same scalar type, with the new
6631/// elements either zero or undef.
6632static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6633 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6634 const SDLoc &dl) {
6635 assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6636 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6637 "Unsupported vector widening type");
6638 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6639 : DAG.getUNDEF(VT);
6640 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6641 DAG.getIntPtrConstant(0, dl));
6642}
6643
6644/// Widen a vector to a larger size with the same scalar type, with the new
6645/// elements either zero or undef.
6646static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6647 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6648 const SDLoc &dl, unsigned WideSizeInBits) {
6649 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6650 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6651 "Unsupported vector widening type");
6652 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6653 MVT SVT = Vec.getSimpleValueType().getScalarType();
6654 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6655 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6656}
6657
6658// Helper function to collect subvector ops that are concatenated together,
6659// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6660// The subvectors in Ops are guaranteed to be the same type.
6661static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6662 SelectionDAG &DAG) {
6663 assert(Ops.empty() && "Expected an empty ops vector");
6664
6665 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6666 Ops.append(N->op_begin(), N->op_end());
6667 return true;
6668 }
6669
6670 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6671 SDValue Src = N->getOperand(0);
6672 SDValue Sub = N->getOperand(1);
6673 const APInt &Idx = N->getConstantOperandAPInt(2);
6674 EVT VT = Src.getValueType();
6675 EVT SubVT = Sub.getValueType();
6676
6677 // TODO - Handle more general insert_subvector chains.
6678 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6679 // insert_subvector(undef, x, lo)
6680 if (Idx == 0 && Src.isUndef()) {
6681 Ops.push_back(Sub);
6682 Ops.push_back(DAG.getUNDEF(SubVT));
6683 return true;
6684 }
6685 if (Idx == (VT.getVectorNumElements() / 2)) {
6686 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6687 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6688 Src.getOperand(1).getValueType() == SubVT &&
6689 isNullConstant(Src.getOperand(2))) {
6690 Ops.push_back(Src.getOperand(1));
6691 Ops.push_back(Sub);
6692 return true;
6693 }
6694 // insert_subvector(x, extract_subvector(x, lo), hi)
6695 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6696 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6697 Ops.append(2, Sub);
6698 return true;
6699 }
6700 // insert_subvector(undef, x, hi)
6701 if (Src.isUndef()) {
6702 Ops.push_back(DAG.getUNDEF(SubVT));
6703 Ops.push_back(Sub);
6704 return true;
6705 }
6706 }
6707 }
6708 }
6709
6710 return false;
6711}
6712
6713static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6714 const SDLoc &dl) {
6715 EVT VT = Op.getValueType();
6716 unsigned NumElems = VT.getVectorNumElements();
6717 unsigned SizeInBits = VT.getSizeInBits();
6718 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6719 "Can't split odd sized vector");
6720
6721 // If this is a splat value (with no-undefs) then use the lower subvector,
6722 // which should be a free extraction.
6723 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6724 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6725 return std::make_pair(Lo, Lo);
6726
6727 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6728 return std::make_pair(Lo, Hi);
6729}
6730
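// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the lo/hi
// split above on a plain container: a no-undef splat can reuse the (cheap)
// lower half for both results.
#include <utility>
#include <vector>
static std::pair<std::vector<int>, std::vector<int>>
splitVecSketch(const std::vector<int> &V, bool IsSplat) {
  std::vector<int> Lo(V.begin(), V.begin() + V.size() / 2);
  if (IsSplat)
    return {Lo, Lo};                     // both halves hold the same values
  std::vector<int> Hi(V.begin() + V.size() / 2, V.end());
  return {Lo, Hi};
}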
6731/// Break an operation into 2 half sized ops and then concatenate the results.
6732static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6733 unsigned NumOps = Op.getNumOperands();
6734 EVT VT = Op.getValueType();
6735 SDLoc dl(Op);
6736
6737 // Split each vector operand into its Lo/Hi halves.
6738 SmallVector<SDValue> LoOps(NumOps, SDValue());
6739 SmallVector<SDValue> HiOps(NumOps, SDValue());
6740 for (unsigned I = 0; I != NumOps; ++I) {
6741 SDValue SrcOp = Op.getOperand(I);
6742 if (!SrcOp.getValueType().isVector()) {
6743 LoOps[I] = HiOps[I] = SrcOp;
6744 continue;
6745 }
6746 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6747 }
6748
6749 EVT LoVT, HiVT;
6750 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6751 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6752 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6753 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6754}
6755
6756/// Break a unary integer operation into 2 half sized ops and then
6757/// concatenate the result back.
6758static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6759 // Make sure we only try to split 256/512-bit types to avoid creating
6760 // narrow vectors.
6761 EVT VT = Op.getValueType();
6762 (void)VT;
6763 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6764 Op.getOperand(0).getValueType().is512BitVector()) &&
6765 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6766 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6767 VT.getVectorNumElements() &&
6768 "Unexpected VTs!");
6769 return splitVectorOp(Op, DAG);
6770}
6771
6772/// Break a binary integer operation into 2 half sized ops and then
6773/// concatenate the result back.
6774static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6775 // Assert that all the types match.
6776 EVT VT = Op.getValueType();
6777 (void)VT;
6778 assert(Op.getOperand(0).getValueType() == VT &&
6779 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6780 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6781 return splitVectorOp(Op, DAG);
6782}
6783
6784// Helper for splitting operands of an operation to a legal target size and
6785// applying a function on each part.
6786// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6787// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6788// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6789// The argument Builder is a function that will be applied on each split part:
6790// SDValue Builder(SelectionDAG &G, SDLoc, ArrayRef<SDValue>)
6791template <typename F>
6792SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6793 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6794 F Builder, bool CheckBWI = true) {
6795 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6796 unsigned NumSubs = 1;
6797 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6798 (!CheckBWI && Subtarget.useAVX512Regs())) {
6799 if (VT.getSizeInBits() > 512) {
6800 NumSubs = VT.getSizeInBits() / 512;
6801 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6802 }
6803 } else if (Subtarget.hasAVX2()) {
6804 if (VT.getSizeInBits() > 256) {
6805 NumSubs = VT.getSizeInBits() / 256;
6806 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6807 }
6808 } else {
6809 if (VT.getSizeInBits() > 128) {
6810 NumSubs = VT.getSizeInBits() / 128;
6811 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6812 }
6813 }
6814
6815 if (NumSubs == 1)
6816 return Builder(DAG, DL, Ops);
6817
6818 SmallVector<SDValue, 4> Subs;
6819 for (unsigned i = 0; i != NumSubs; ++i) {
6820 SmallVector<SDValue, 2> SubOps;
6821 for (SDValue Op : Ops) {
6822 EVT OpVT = Op.getValueType();
6823 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6824 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6825 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6826 }
6827 Subs.push_back(Builder(DAG, DL, SubOps));
6828 }
6829 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6830}
6831
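// A hypothetical caller sketch (not from X86ISelLowering.cpp) showing the
// Builder callback shape the helper above expects. ISD::ADD is only a
// stand-in opcode, and the enclosing lowering context that would supply
// DAG, Subtarget, DL, VT, A and B is assumed.
static SDValue splitWideAddSketch(SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget,
                                  const SDLoc &DL, EVT VT, SDValue A,
                                  SDValue B) {
  auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                       ArrayRef<SDValue> Ops) {
    // Each invocation sees one legal-width chunk of the original operands.
    return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops[0], Ops[1]);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {A, B}, AddBuilder);
}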
6832// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6833// targets.
6834static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6835 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6836 const X86Subtarget &Subtarget) {
6837 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6838 MVT SVT = VT.getScalarType();
6839
6840 // If we have a 32/64-bit splatted constant, splat it to DstTy to
6841 // encourage a foldable broadcasted operand.
6842 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6843 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6844 // AVX512 broadcasts 32/64-bit operands.
6845 // TODO: Support float once getAVX512Node is used by fp-ops.
6846 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6847 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6848 return SDValue();
6849 // If we're not widening, don't bother if we're not bitcasting.
6850 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6851 return SDValue();
6852 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6853 APInt SplatValue, SplatUndef;
6854 unsigned SplatBitSize;
6855 bool HasAnyUndefs;
6856 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6857 HasAnyUndefs, OpEltSizeInBits) &&
6858 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6859 return DAG.getConstant(SplatValue, DL, DstVT);
6860 }
6861 return SDValue();
6862 };
6863
6864 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6865
6866 MVT DstVT = VT;
6867 if (Widen)
6868 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6869
6870 // Canonicalize src operands.
6871 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6872 for (SDValue &Op : SrcOps) {
6873 MVT OpVT = Op.getSimpleValueType();
6874 // Just pass through scalar operands.
6875 if (!OpVT.isVector())
6876 continue;
6877 assert(OpVT == VT && "Vector type mismatch");
6878
6879 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6880 Op = BroadcastOp;
6881 continue;
6882 }
6883
6884 // Just widen the subvector by inserting into an undef wide vector.
6885 if (Widen)
6886 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6887 }
6888
6889 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6890
6891 // Perform the 512-bit op then extract the bottom subvector.
6892 if (Widen)
6893 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6894 return Res;
6895}
6896
6897/// Insert i1-subvector to i1-vector.
6898static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6899 const X86Subtarget &Subtarget) {
6900
6901 SDLoc dl(Op);
6902 SDValue Vec = Op.getOperand(0);
6903 SDValue SubVec = Op.getOperand(1);
6904 SDValue Idx = Op.getOperand(2);
6905 unsigned IdxVal = Op.getConstantOperandVal(2);
6906
6907 // Inserting undef is a nop. We can just return the original vector.
6908 if (SubVec.isUndef())
6909 return Vec;
6910
6911 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6912 return Op;
6913
6914 MVT OpVT = Op.getSimpleValueType();
6915 unsigned NumElems = OpVT.getVectorNumElements();
6916 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6917
6918 // Extend to natively supported kshift.
6919 MVT WideOpVT = OpVT;
6920 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6921 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6922
6923 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6924 // if necessary.
6925 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6926 // May need to promote to a legal type.
6927 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6928 DAG.getConstant(0, dl, WideOpVT),
6929 SubVec, Idx);
6930 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6931 }
6932
6933 MVT SubVecVT = SubVec.getSimpleValueType();
6934 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6935 assert(IdxVal + SubVecNumElems <= NumElems &&
6936 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6937 "Unexpected index value in INSERT_SUBVECTOR");
6938
6939 SDValue Undef = DAG.getUNDEF(WideOpVT);
6940
6941 if (IdxVal == 0) {
6942 // Zero lower bits of the Vec
6943 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6944 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6945 ZeroIdx);
6946 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6947 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6948 // Merge them together, SubVec should be zero extended.
6949 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6950 DAG.getConstant(0, dl, WideOpVT),
6951 SubVec, ZeroIdx);
6952 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6953 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6954 }
6955
6956 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6957 Undef, SubVec, ZeroIdx);
6958
6959 if (Vec.isUndef()) {
6960 assert(IdxVal != 0 && "Unexpected index");
6961 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6962 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6963 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6964 }
6965
6966 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6967 assert(IdxVal != 0 && "Unexpected index");
6968 // If upper elements of Vec are known undef, then just shift into place.
6969 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6970 [](SDValue V) { return V.isUndef(); })) {
6971 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6972 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6973 } else {
6974 NumElems = WideOpVT.getVectorNumElements();
6975 unsigned ShiftLeft = NumElems - SubVecNumElems;
6976 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6977 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6978 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6979 if (ShiftRight != 0)
6980 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6981 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6982 }
6983 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6984 }
6985
6986 // Simple case when we put the subvector in the upper part.
6987 if (IdxVal + SubVecNumElems == NumElems) {
6988 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6989 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6990 if (SubVecNumElems * 2 == NumElems) {
6991 // Special case, use legal zero extending insert_subvector. This allows
6992 // isel to optimize when bits are known zero.
6993 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6994 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6995 DAG.getConstant(0, dl, WideOpVT),
6996 Vec, ZeroIdx);
6997 } else {
6998 // Otherwise use explicit shifts to zero the bits.
6999 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7000 Undef, Vec, ZeroIdx);
7001 NumElems = WideOpVT.getVectorNumElements();
7002 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7003 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7004 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7005 }
7006 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7007 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7008 }
7009
7010 // Inserting into the middle is more complicated.
7011
7012 NumElems = WideOpVT.getVectorNumElements();
7013
7014 // Widen the vector if needed.
7015 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7016
7017 unsigned ShiftLeft = NumElems - SubVecNumElems;
7018 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7019
7020 // Do an optimization for the most frequently used types.
7021 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7022 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7023 Mask0.flipAllBits();
7024 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7025 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7026 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7027 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7028 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7029 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7030 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7031 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7032
7033 // Reduce to original width if needed.
7034 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7035 }
7036
7037 // Clear the upper bits of the subvector and move it to its insert position.
7038 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7039 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7040 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7041 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7042
7043 // Isolate the bits below the insertion point.
7044 unsigned LowShift = NumElems - IdxVal;
7045 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7046 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7047 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7048 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7049
7050 // Isolate the bits after the last inserted bit.
7051 unsigned HighShift = IdxVal + SubVecNumElems;
7052 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7053 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7054 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7055 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7056
7057 // Now OR all 3 pieces together.
7058 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7059 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7060
7061 // Reduce to original width if needed.
7062 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7063}
7064
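// Illustrative standalone sketch (not from X86ISelLowering.cpp) of the
// mask/OR form of the middle insertion above, modelling the k-register as a
// plain 64-bit integer: clear the target bit range in Vec, position SubVec,
// and OR the two. Assumes IdxVal + SubNumBits <= 64.
#include <cstdint>
static uint64_t insertBitsSketch(uint64_t Vec, uint64_t SubVec,
                                 unsigned IdxVal, unsigned SubNumBits) {
  uint64_t Field = (SubNumBits < 64 ? (1ULL << SubNumBits) - 1ULL : ~0ULL)
                   << IdxVal;                     // bits being replaced
  uint64_t Cleared = Vec & ~Field;                // zero the insertion range
  uint64_t Placed = (SubVec << IdxVal) & Field;   // move SubVec into place
  return Cleared | Placed;
}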
7065static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7066 const SDLoc &dl) {
7067 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7068 EVT SubVT = V1.getValueType();
7069 EVT SubSVT = SubVT.getScalarType();
7070 unsigned SubNumElts = SubVT.getVectorNumElements();
7071 unsigned SubVectorWidth = SubVT.getSizeInBits();
7072 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7073 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7074 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7075}
7076
7077/// Returns a vector of specified type with all bits set.
7078/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7079/// Then bitcast to their original type, ensuring they get CSE'd.
7080static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7081 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7082 "Expected a 128/256/512-bit vector type");
7083
7084 APInt Ones = APInt::getAllOnes(32);
7085 unsigned NumElts = VT.getSizeInBits() / 32;
7086 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7087 return DAG.getBitcast(VT, Vec);
7088}
7089
7090static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7091 SDValue In, SelectionDAG &DAG) {
7092 EVT InVT = In.getValueType();
7093 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7094 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7095 ISD::ZERO_EXTEND == Opcode) &&
7096 "Unknown extension opcode");
7097
7098 // For 256-bit vectors, we only need the lower (128-bit) input half.
7099 // For 512-bit vectors, we only need the lower input half or quarter.
7100 if (InVT.getSizeInBits() > 128) {
7101 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7102 "Expected VTs to be the same size!");
7103 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7104 In = extractSubVector(In, 0, DAG, DL,
7105 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7106 InVT = In.getValueType();
7107 }
7108
7109 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7110 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7111
7112 return DAG.getNode(Opcode, DL, VT, In);
7113}
7114
7115// Match (xor X, -1) -> X.
7116// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7117// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7118static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7119 V = peekThroughBitcasts(V);
7120 if (V.getOpcode() == ISD::XOR &&
7121 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7122 isAllOnesConstant(V.getOperand(1))))
7123 return V.getOperand(0);
7124 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7125 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7126 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7127 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7128 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7129 Not, V.getOperand(1));
7130 }
7131 }
7132 SmallVector<SDValue, 2> CatOps;
7133 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7134 for (SDValue &CatOp : CatOps) {
7135 SDValue NotCat = IsNOT(CatOp, DAG);
7136 if (!NotCat) return SDValue();
7137 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7138 }
7139 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7140 }
7141 return SDValue();
7142}
7143
7144void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7145 bool Lo, bool Unary) {
7146 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7147 "Illegal vector type to unpack");
7148 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7149 int NumElts = VT.getVectorNumElements();
7150 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7151 for (int i = 0; i < NumElts; ++i) {
7152 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7153 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7154 Pos += (Unary ? 0 : NumElts * (i % 2));
7155 Pos += (Lo ? 0 : NumEltsInLane / 2);
7156 Mask.push_back(Pos);
7157 }
7158}
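// Illustrative sketch (not part of X86ISelLowering.cpp): a hypothetical
// standalone helper using the same index arithmetic as createUnpackShuffleMask
// above, so the generated pattern is easy to verify. For a 128-bit v8i16
// (NumElts = 8, NumEltsInLane = 8), Lo + binary gives <0,8,1,9,2,10,3,11>,
// i.e. the PUNPCKLWD interleave of the two source operands.
#include <vector>

static std::vector<int> unpackMask(int NumElts, int NumEltsInLane, bool Lo,
                                   bool Unary) {
  std::vector<int> Mask;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2)); // odd slots read the second operand
    Pos += (Lo ? 0 : NumEltsInLane / 2);    // Hi starts at the lane midpoint
    Mask.push_back(Pos);
  }
  return Mask; // unpackMask(8, 8, /*Lo=*/true, /*Unary=*/false)
               //   == {0, 8, 1, 9, 2, 10, 3, 11}
}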
7159
7160/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7161/// imposed by AVX and specific to the unary pattern. Example:
7162/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7163/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7164void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7165 bool Lo) {
7166 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7167 int NumElts = VT.getVectorNumElements();
7168 for (int i = 0; i < NumElts; ++i) {
7169 int Pos = i / 2;
7170 Pos += (Lo ? 0 : NumElts / 2);
7171 Mask.push_back(Pos);
7172 }
7173}
7174
7175// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7176static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7177 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7178 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7179 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7180 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7181 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7182 int M = Mask[I];
7183 if (M < 0)
7184 continue;
7185 SDValue V = (M < NumElts) ? V1 : V2;
7186 if (V.isUndef())
7187 continue;
7188 Ops[I] = V.getOperand(M % NumElts);
7189 }
7190 return DAG.getBuildVector(VT, dl, Ops);
7191 }
7192
7193 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7194}
7195
7196/// Returns a vector_shuffle node for an unpackl operation.
7197static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7198 SDValue V1, SDValue V2) {
7199 SmallVector<int, 8> Mask;
7200 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7201 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7202}
7203
7204/// Returns a vector_shuffle node for an unpackh operation.
7205static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7206 SDValue V1, SDValue V2) {
7207 SmallVector<int, 8> Mask;
7208 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7209 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7210}
7211
7212/// Returns a node that packs the LHS + RHS nodes together at half width.
7213/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7214/// TODO: Add subvector splitting if/when we have a need for it.
7215static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7216 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7217 bool PackHiHalf = false) {
7218 MVT OpVT = LHS.getSimpleValueType();
7219 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7220 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7221 assert(OpVT == RHS.getSimpleValueType() &&
7222 VT.getSizeInBits() == OpVT.getSizeInBits() &&
7223 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7224 "Unexpected PACK operand types");
7225 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7226 "Unexpected PACK result type");
7227
7228 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7229 if (EltSizeInBits == 32) {
7230 SmallVector<int> PackMask;
7231 int Offset = PackHiHalf ? 1 : 0;
7232 int NumElts = VT.getVectorNumElements();
7233 for (int I = 0; I != NumElts; I += 4) {
7234 PackMask.push_back(I + Offset);
7235 PackMask.push_back(I + Offset + 2);
7236 PackMask.push_back(I + Offset + NumElts);
7237 PackMask.push_back(I + Offset + NumElts + 2);
7238 }
7239 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7240 DAG.getBitcast(VT, RHS), PackMask);
7241 }
7242
7243 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7244 if (!PackHiHalf) {
7245 if (UsePackUS &&
7246 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7247 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7248 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7249
7250 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7251 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7252 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7253 }
7254
7255 // Fallback to sign/zero extending the requested half and pack.
7256 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7257 if (UsePackUS) {
7258 if (PackHiHalf) {
7259 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7260 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7261 } else {
7262 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7263 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7264 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7265 };
7266 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7267 };
7268
7269 if (!PackHiHalf) {
7270 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7271 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7272 }
7273 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7274 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7275 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7276}
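// Illustrative sketch (not part of X86ISelLowering.cpp): the shuffle mask the
// EltSizeInBits == 32 path of getPack above builds, in a hypothetical
// standalone helper. Packing two v4i64 operands into a v8i32 result
// (NumElts = 8) with PackHiHalf = false takes the low i32 of each i64, one
// 128-bit lane at a time, giving <0,2,8,10,4,6,12,14> over the concatenation
// of bitcast(LHS) and bitcast(RHS).
#include <vector>

static std::vector<int> packI32Mask(int NumElts, bool PackHiHalf) {
  std::vector<int> PackMask;
  int Offset = PackHiHalf ? 1 : 0;
  for (int I = 0; I != NumElts; I += 4) {
    PackMask.push_back(I + Offset);           // LHS halves of this lane
    PackMask.push_back(I + Offset + 2);
    PackMask.push_back(I + Offset + NumElts); // RHS halves of the same lane
    PackMask.push_back(I + Offset + NumElts + 2);
  }
  return PackMask; // packI32Mask(8, false) == {0, 2, 8, 10, 4, 6, 12, 14}
}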
7277
7278/// Return a vector_shuffle of the specified vector of zero or undef vector.
7279/// This produces a shuffle where the low element of V2 is swizzled into the
7280/// zero/undef vector, landing at element Idx.
7281/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7282static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7283 bool IsZero,
7284 const X86Subtarget &Subtarget,
7285 SelectionDAG &DAG) {
7286 MVT VT = V2.getSimpleValueType();
7287 SDValue V1 = IsZero
7288 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7289 int NumElems = VT.getVectorNumElements();
7290 SmallVector<int, 16> MaskVec(NumElems);
7291 for (int i = 0; i != NumElems; ++i)
7292 // If this is the insertion idx, put the low elt of V2 here.
7293 MaskVec[i] = (i == Idx) ? NumElems : i;
7294 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7295}
7296
7297static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7298 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7299 Ptr.getOpcode() == X86ISD::WrapperRIP)
7300 Ptr = Ptr.getOperand(0);
7301
7302 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7303 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7304 return nullptr;
7305
7306 return CNode->getConstVal();
7307}
7308
7309static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7310 if (!Load || !ISD::isNormalLoad(Load))
7311 return nullptr;
7312 return getTargetConstantFromBasePtr(Load->getBasePtr());
7313}
7314
7315static const Constant *getTargetConstantFromNode(SDValue Op) {
7316 Op = peekThroughBitcasts(Op);
7317 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7318}
7319
7320const Constant *
7321X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7322 assert(LD && "Unexpected null LoadSDNode");
7323 return getTargetConstantFromNode(LD);
7324}
7325
7326// Extract raw constant bits from constant pools.
7327static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7328 APInt &UndefElts,
7329 SmallVectorImpl<APInt> &EltBits,
7330 bool AllowWholeUndefs = true,
7331 bool AllowPartialUndefs = true) {
7332 assert(EltBits.empty() && "Expected an empty EltBits vector");
7333
7334 Op = peekThroughBitcasts(Op);
7335
7336 EVT VT = Op.getValueType();
7337 unsigned SizeInBits = VT.getSizeInBits();
7338 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7339 unsigned NumElts = SizeInBits / EltSizeInBits;
7340
7341 // Bitcast a source array of element bits to the target size.
7342 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7343 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7344 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7345 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7346 "Constant bit sizes don't match");
7347
7348 // Don't split if we don't allow undef bits.
7349 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7350 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7351 return false;
7352
7353 // If we're already the right size, don't bother bitcasting.
7354 if (NumSrcElts == NumElts) {
7355 UndefElts = UndefSrcElts;
7356 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7357 return true;
7358 }
7359
7360 // Extract all the undef/constant element data and pack into single bitsets.
7361 APInt UndefBits(SizeInBits, 0);
7362 APInt MaskBits(SizeInBits, 0);
7363
7364 for (unsigned i = 0; i != NumSrcElts; ++i) {
7365 unsigned BitOffset = i * SrcEltSizeInBits;
7366 if (UndefSrcElts[i])
7367 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7368 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7369 }
7370
7371 // Split the undef/constant single bitset data into the target elements.
7372 UndefElts = APInt(NumElts, 0);
7373 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7374
7375 for (unsigned i = 0; i != NumElts; ++i) {
7376 unsigned BitOffset = i * EltSizeInBits;
7377 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7378
7379 // Only treat an element as UNDEF if all bits are UNDEF.
7380 if (UndefEltBits.isAllOnes()) {
7381 if (!AllowWholeUndefs)
7382 return false;
7383 UndefElts.setBit(i);
7384 continue;
7385 }
7386
7387 // If only some bits are UNDEF then treat them as zero (or bail if not
7388 // supported).
7389 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7390 return false;
7391
7392 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7393 }
7394 return true;
7395 };
7396
7397 // Collect constant bits and insert into mask/undef bit masks.
7398 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7399 unsigned UndefBitIndex) {
7400 if (!Cst)
7401 return false;
7402 if (isa<UndefValue>(Cst)) {
7403 Undefs.setBit(UndefBitIndex);
7404 return true;
7405 }
7406 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7407 Mask = CInt->getValue();
7408 return true;
7409 }
7410 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7411 Mask = CFP->getValueAPF().bitcastToAPInt();
7412 return true;
7413 }
7414 return false;
7415 };
7416
7417 // Handle UNDEFs.
7418 if (Op.isUndef()) {
7419 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7420 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7421 return CastBitData(UndefSrcElts, SrcEltBits);
7422 }
7423
7424 // Extract scalar constant bits.
7425 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7426 APInt UndefSrcElts = APInt::getZero(1);
7427 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7428 return CastBitData(UndefSrcElts, SrcEltBits);
7429 }
7430 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7431 APInt UndefSrcElts = APInt::getZero(1);
7432 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7433 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7434 return CastBitData(UndefSrcElts, SrcEltBits);
7435 }
7436
7437 // Extract constant bits from build vector.
7438 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7439 BitVector Undefs;
7440 SmallVector<APInt> SrcEltBits;
7441 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7442 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7443 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7444 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7445 if (Undefs[I])
7446 UndefSrcElts.setBit(I);
7447 return CastBitData(UndefSrcElts, SrcEltBits);
7448 }
7449 }
7450
7451 // Extract constant bits from constant pool vector.
7452 if (auto *Cst = getTargetConstantFromNode(Op)) {
7453 Type *CstTy = Cst->getType();
7454 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7455 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7456 return false;
7457
7458 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7459 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7460
7461 APInt UndefSrcElts(NumSrcElts, 0);
7462 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7463 for (unsigned i = 0; i != NumSrcElts; ++i)
7464 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7465 UndefSrcElts, i))
7466 return false;
7467
7468 return CastBitData(UndefSrcElts, SrcEltBits);
7469 }
7470
7471 // Extract constant bits from a broadcasted constant pool scalar.
7472 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7473 EltSizeInBits <= VT.getScalarSizeInBits()) {
7474 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7475 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7476 return false;
7477
7478 SDValue Ptr = MemIntr->getBasePtr();
7479 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7480 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7481 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7482
7483 APInt UndefSrcElts(NumSrcElts, 0);
7484 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7485 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7486 if (UndefSrcElts[0])
7487 UndefSrcElts.setBits(0, NumSrcElts);
7488 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7489 return CastBitData(UndefSrcElts, SrcEltBits);
7490 }
7491 }
7492 }
7493
7494 // Extract constant bits from a subvector broadcast.
7495 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7496 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7497 SDValue Ptr = MemIntr->getBasePtr();
7498 // The source constant may be larger than the subvector broadcast,
7499 // ensure we extract the correct subvector constants.
7500 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7501 Type *CstTy = Cst->getType();
7502 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7503 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7504 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7505 (SizeInBits % SubVecSizeInBits) != 0)
7506 return false;
7507 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7508 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7509 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7510 APInt UndefSubElts(NumSubElts, 0);
7511 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7512 APInt(CstEltSizeInBits, 0));
7513 for (unsigned i = 0; i != NumSubElts; ++i) {
7514 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7515 UndefSubElts, i))
7516 return false;
7517 for (unsigned j = 1; j != NumSubVecs; ++j)
7518 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7519 }
7520 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7521 UndefSubElts);
7522 return CastBitData(UndefSubElts, SubEltBits);
7523 }
7524 }
7525
7526 // Extract a rematerialized scalar constant insertion.
7527 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7528 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7529 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7530 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7531 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7532
7533 APInt UndefSrcElts(NumSrcElts, 0);
7534 SmallVector<APInt, 64> SrcEltBits;
7535 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7536 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7537 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7538 return CastBitData(UndefSrcElts, SrcEltBits);
7539 }
7540
7541 // Insert constant bits from a base and sub vector sources.
7542 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7543 // If bitcasts to larger elements we might lose track of undefs - don't
7544 // allow any to be safe.
7545 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7546 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7547
7548 APInt UndefSrcElts, UndefSubElts;
7549 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7550 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7551 UndefSubElts, EltSubBits,
7552 AllowWholeUndefs && AllowUndefs,
7553 AllowPartialUndefs && AllowUndefs) &&
7554 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7555 UndefSrcElts, EltSrcBits,
7556 AllowWholeUndefs && AllowUndefs,
7557 AllowPartialUndefs && AllowUndefs)) {
7558 unsigned BaseIdx = Op.getConstantOperandVal(2);
7559 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7560 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7561 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7562 return CastBitData(UndefSrcElts, EltSrcBits);
7563 }
7564 }
7565
7566 // Extract constant bits from a subvector's source.
7567 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7568 // TODO - support extract_subvector through bitcasts.
7569 if (EltSizeInBits != VT.getScalarSizeInBits())
7570 return false;
7571
7572 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7573 UndefElts, EltBits, AllowWholeUndefs,
7574 AllowPartialUndefs)) {
7575 EVT SrcVT = Op.getOperand(0).getValueType();
7576 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7577 unsigned NumSubElts = VT.getVectorNumElements();
7578 unsigned BaseIdx = Op.getConstantOperandVal(1);
7579 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7580 if ((BaseIdx + NumSubElts) != NumSrcElts)
7581 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7582 if (BaseIdx != 0)
7583 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7584 return true;
7585 }
7586 }
7587
7588 // Extract constant bits from shuffle node sources.
7589 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7590 // TODO - support shuffle through bitcasts.
7591 if (EltSizeInBits != VT.getScalarSizeInBits())
7592 return false;
7593
7594 ArrayRef<int> Mask = SVN->getMask();
7595 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7596 llvm::any_of(Mask, [](int M) { return M < 0; }))
7597 return false;
7598
7599 APInt UndefElts0, UndefElts1;
7600 SmallVector<APInt, 32> EltBits0, EltBits1;
7601 if (isAnyInRange(Mask, 0, NumElts) &&
7602 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7603 UndefElts0, EltBits0, AllowWholeUndefs,
7604 AllowPartialUndefs))
7605 return false;
7606 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7607 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7608 UndefElts1, EltBits1, AllowWholeUndefs,
7609 AllowPartialUndefs))
7610 return false;
7611
7612 UndefElts = APInt::getZero(NumElts);
7613 for (int i = 0; i != (int)NumElts; ++i) {
7614 int M = Mask[i];
7615 if (M < 0) {
7616 UndefElts.setBit(i);
7617 EltBits.push_back(APInt::getZero(EltSizeInBits));
7618 } else if (M < (int)NumElts) {
7619 if (UndefElts0[M])
7620 UndefElts.setBit(i);
7621 EltBits.push_back(EltBits0[M]);
7622 } else {
7623 if (UndefElts1[M - NumElts])
7624 UndefElts.setBit(i);
7625 EltBits.push_back(EltBits1[M - NumElts]);
7626 }
7627 }
7628 return true;
7629 }
7630
7631 return false;
7632}
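// Illustrative sketch (not part of X86ISelLowering.cpp): the re-bucketing that
// CastBitData performs inside getTargetConstantBitsFromNode, shown with plain
// integers instead of APInt and without undef tracking. Source elements are
// concatenated little-endian into one bit buffer and then re-sliced at the
// requested element width. The helper name and signature are hypothetical.
#include <cstdint>
#include <vector>

static std::vector<uint64_t> resliceBits(const std::vector<uint64_t> &SrcElts,
                                         unsigned SrcBits, unsigned DstBits) {
  // Pack the source elements into one flat little-endian bit buffer.
  std::vector<bool> Bits(SrcElts.size() * SrcBits);
  for (unsigned i = 0; i != SrcElts.size(); ++i)
    for (unsigned b = 0; b != SrcBits; ++b)
      Bits[i * SrcBits + b] = (SrcElts[i] >> b) & 1;
  // Re-slice the buffer at the new element boundaries.
  std::vector<uint64_t> DstElts(Bits.size() / DstBits, 0);
  for (unsigned i = 0; i != DstElts.size(); ++i)
    for (unsigned b = 0; b != DstBits; ++b)
      DstElts[i] |= uint64_t(Bits[i * DstBits + b]) << b;
  return DstElts; // resliceBits({0x1122334455667788}, 64, 32)
                  //   == {0x55667788, 0x11223344}
}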
7633
7634namespace llvm {
7635namespace X86 {
7636bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7637 APInt UndefElts;
7638 SmallVector<APInt, 16> EltBits;
7639 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7640 UndefElts, EltBits, true,
7641 AllowPartialUndefs)) {
7642 int SplatIndex = -1;
7643 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7644 if (UndefElts[i])
7645 continue;
7646 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7647 SplatIndex = -1;
7648 break;
7649 }
7650 SplatIndex = i;
7651 }
7652 if (0 <= SplatIndex) {
7653 SplatVal = EltBits[SplatIndex];
7654 return true;
7655 }
7656 }
7657
7658 return false;
7659}
7660} // namespace X86
7661} // namespace llvm
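// Illustrative sketch (not part of X86ISelLowering.cpp): the splat test that
// X86::isConstantSplat applies to the extracted element bits, written as a
// hypothetical standalone helper on plain integers with an explicit undef
// mask. Defined elements must all agree and undef elements are skipped, so
// {7, undef, 7, 7} reports a splat of 7, while an all-undef vector does not.
#include <cstdint>
#include <vector>

static bool isSplat(const std::vector<uint64_t> &EltBits,
                    const std::vector<bool> &UndefElts, uint64_t &SplatVal) {
  int SplatIndex = -1;
  for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
    if (UndefElts[i])
      continue;                 // ignore undef elements entirely
    if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex])
      return false;             // two defined elements disagree
    SplatIndex = i;
  }
  if (SplatIndex < 0)
    return false;               // no defined element at all
  SplatVal = EltBits[SplatIndex];
  return true;
}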
7662
7663static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7664 unsigned MaskEltSizeInBits,
7665 SmallVectorImpl<uint64_t> &RawMask,
7666 APInt &UndefElts) {
7667 // Extract the raw target constant bits.
7668 SmallVector<APInt, 64> EltBits;
7669 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7670 EltBits, /* AllowWholeUndefs */ true,
7671 /* AllowPartialUndefs */ false))
7672 return false;
7673
7674 // Insert the extracted elements into the mask.
7675 for (const APInt &Elt : EltBits)
7676 RawMask.push_back(Elt.getZExtValue());
7677
7678 return true;
7679}
7680
7681/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7682/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7683/// Note: This ignores saturation, so inputs must be checked first.
7684static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7685 bool Unary, unsigned NumStages = 1) {
7686 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7687 unsigned NumElts = VT.getVectorNumElements();
7688 unsigned NumLanes = VT.getSizeInBits() / 128;
7689 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7690 unsigned Offset = Unary ? 0 : NumElts;
7691 unsigned Repetitions = 1u << (NumStages - 1);
7692 unsigned Increment = 1u << NumStages;
7693 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7694
7695 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7696 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7697 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7698 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7699 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7700 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7701 }
7702 }
7703}
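// Illustrative check (not part of X86ISelLowering.cpp): for a single-stage,
// binary v16i8 pack (one 128-bit lane, Increment = 2) createPackShuffleMask
// above yields <0,2,4,...,14, 16,18,...,30>: the low half of the lane keeps
// the even bytes of the LHS and the high half the even bytes of the RHS,
// matching how PACKSSWB/PACKUSWB lay out their results when saturation never
// fires. The helper below is a hypothetical standalone specialisation.
#include <vector>

static std::vector<int> packWBMask(bool Unary) {
  std::vector<int> Mask;
  for (int Elt = 0; Elt != 16; Elt += 2)
    Mask.push_back(Elt);                    // LHS half of the lane
  for (int Elt = 0; Elt != 16; Elt += 2)
    Mask.push_back(Elt + (Unary ? 0 : 16)); // RHS half (same operand if unary)
  return Mask;
}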
7704
7705// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7706static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7707 APInt &DemandedLHS, APInt &DemandedRHS) {
7708 int NumLanes = VT.getSizeInBits() / 128;
7709 int NumElts = DemandedElts.getBitWidth();
7710 int NumInnerElts = NumElts / 2;
7711 int NumEltsPerLane = NumElts / NumLanes;
7712 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7713
7714 DemandedLHS = APInt::getZero(NumInnerElts);
7715 DemandedRHS = APInt::getZero(NumInnerElts);
7716
7717 // Map DemandedElts to the packed operands.
7718 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7719 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7720 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7721 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7722 if (DemandedElts[OuterIdx])
7723 DemandedLHS.setBit(InnerIdx);
7724 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7725 DemandedRHS.setBit(InnerIdx);
7726 }
7727 }
7728}
7729
7730// Split the demanded elts of a HADD/HSUB node between its operands.
7731static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7732 APInt &DemandedLHS, APInt &DemandedRHS) {
7733 int NumLanes = VT.getSizeInBits() / 128;
7734 int NumElts = DemandedElts.getBitWidth();
7735 int NumEltsPerLane = NumElts / NumLanes;
7736 int HalfEltsPerLane = NumEltsPerLane / 2;
7737
7738 DemandedLHS = APInt::getZero(NumElts);
7739 DemandedRHS = APInt::getZero(NumElts);
7740
7741 // Map DemandedElts to the horizontal operands.
7742 for (int Idx = 0; Idx != NumElts; ++Idx) {
7743 if (!DemandedElts[Idx])
7744 continue;
7745 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7746 int LocalIdx = Idx % NumEltsPerLane;
7747 if (LocalIdx < HalfEltsPerLane) {
7748 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7749 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7750 } else {
7751 LocalIdx -= HalfEltsPerLane;
7752 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7753 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7754 }
7755 }
7756}
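// Illustrative check (not part of X86ISelLowering.cpp): the mapping that
// getHorizDemandedElts above encodes, specialised to a v8i32 HADD/HSUB (two
// 128-bit lanes, NumEltsPerLane = 4) in a hypothetical standalone helper.
// Each result element combines two adjacent source elements; the low half of
// each lane reads the LHS, the high half reads the RHS.
#include <utility>

static std::pair<int, int> horizSourcePair(int Idx, bool &FromRHS) {
  const int NumEltsPerLane = 4, HalfEltsPerLane = 2;
  int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
  int LocalIdx = Idx % NumEltsPerLane;
  FromRHS = LocalIdx >= HalfEltsPerLane;
  if (FromRHS)
    LocalIdx -= HalfEltsPerLane;
  // e.g. Idx = 5 -> LHS elements {6, 7}; Idx = 2 -> RHS elements {0, 1}.
  return {LaneIdx + 2 * LocalIdx, LaneIdx + 2 * LocalIdx + 1};
}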
7757
7758/// Calculates the shuffle mask corresponding to the target-specific opcode.
7759/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7760/// operands in \p Ops, and returns true.
7761/// Sets \p IsUnary to true if only one source is used. Note that this will set
7762/// IsUnary for shuffles which use a single input multiple times, and in those
7763/// cases it will adjust the mask to only have indices within that single input.
7764/// It is an error to call this with non-empty Mask/Ops vectors.
7765static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7766 SmallVectorImpl<SDValue> &Ops,
7767 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7768 unsigned NumElems = VT.getVectorNumElements();
7769 unsigned MaskEltSize = VT.getScalarSizeInBits();
7770 SmallVector<uint64_t, 32> RawMask;
7771 APInt RawUndefs;
7772 uint64_t ImmN;
7773
7774 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7775 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7776
7777 IsUnary = false;
7778 bool IsFakeUnary = false;
7779 switch (N->getOpcode()) {
7780 case X86ISD::BLENDI:
7781 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7782 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7783 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7784 DecodeBLENDMask(NumElems, ImmN, Mask);
7785 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7786 break;
7787 case X86ISD::SHUFP:
7788 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7789 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7790 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7791 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7792 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7793 break;
7794 case X86ISD::INSERTPS:
7795 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7796 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7797 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7798 DecodeINSERTPSMask(ImmN, Mask);
7799 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7800 break;
7801 case X86ISD::EXTRQI:
7802 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7803 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7804 isa<ConstantSDNode>(N->getOperand(2))) {
7805 int BitLen = N->getConstantOperandVal(1);
7806 int BitIdx = N->getConstantOperandVal(2);
7807 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7808 IsUnary = true;
7809 }
7810 break;
7811 case X86ISD::INSERTQI:
7812 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7813 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7814 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7815 isa<ConstantSDNode>(N->getOperand(3))) {
7816 int BitLen = N->getConstantOperandVal(2);
7817 int BitIdx = N->getConstantOperandVal(3);
7818 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7819 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7820 }
7821 break;
7822 case X86ISD::UNPCKH:
7823 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7824 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7825 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7826 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7827 break;
7828 case X86ISD::UNPCKL:
7829 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7830 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7831 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7832 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7833 break;
7834 case X86ISD::MOVHLPS:
7835 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7836 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7837 DecodeMOVHLPSMask(NumElems, Mask);
7838 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7839 break;
7840 case X86ISD::MOVLHPS:
7841 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7842 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7843 DecodeMOVLHPSMask(NumElems, Mask);
7844 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7845 break;
7846 case X86ISD::VALIGN:
7847 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7848 "Only 32-bit and 64-bit elements are supported!");
7849 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7850 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7851 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7852 DecodeVALIGNMask(NumElems, ImmN, Mask);
7853 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7854 Ops.push_back(N->getOperand(1));
7855 Ops.push_back(N->getOperand(0));
7856 break;
7857 case X86ISD::PALIGNR:
7858 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7859 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7860 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7861 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7862 DecodePALIGNRMask(NumElems, ImmN, Mask);
7863 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7864 Ops.push_back(N->getOperand(1));
7865 Ops.push_back(N->getOperand(0));
7866 break;
7867 case X86ISD::VSHLDQ:
7868 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7869 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7870 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7871 DecodePSLLDQMask(NumElems, ImmN, Mask);
7872 IsUnary = true;
7873 break;
7874 case X86ISD::VSRLDQ:
7875 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7876 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7877 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7878 DecodePSRLDQMask(NumElems, ImmN, Mask);
7879 IsUnary = true;
7880 break;
7881 case X86ISD::PSHUFD:
7882 case X86ISD::VPERMILPI:
7883 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7884 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7885 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7886 IsUnary = true;
7887 break;
7888 case X86ISD::PSHUFHW:
7889 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7890 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7891 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7892 IsUnary = true;
7893 break;
7894 case X86ISD::PSHUFLW:
7895 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7896 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7897 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7898 IsUnary = true;
7899 break;
7900 case X86ISD::VZEXT_MOVL:
7901 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7902 DecodeZeroMoveLowMask(NumElems, Mask);
7903 IsUnary = true;
7904 break;
7905 case X86ISD::VBROADCAST:
7906 // We only decode broadcasts of same-sized vectors, peeking through to
7907 // extracted subvectors is likely to cause hasOneUse issues with
7908 // SimplifyDemandedBits etc.
7909 if (N->getOperand(0).getValueType() == VT) {
7910 DecodeVectorBroadcast(NumElems, Mask);
7911 IsUnary = true;
7912 break;
7913 }
7914 return false;
7915 case X86ISD::VPERMILPV: {
7916 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7917 IsUnary = true;
7918 SDValue MaskNode = N->getOperand(1);
7919 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7920 RawUndefs)) {
7921 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7922 break;
7923 }
7924 return false;
7925 }
7926 case X86ISD::PSHUFB: {
7927 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7928 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7929 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7930 IsUnary = true;
7931 SDValue MaskNode = N->getOperand(1);
7932 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7933 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7934 break;
7935 }
7936 return false;
7937 }
7938 case X86ISD::VPERMI:
7939 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7940 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7941 DecodeVPERMMask(NumElems, ImmN, Mask);
7942 IsUnary = true;
7943 break;
7944 case X86ISD::MOVSS:
7945 case X86ISD::MOVSD:
7946 case X86ISD::MOVSH:
7947 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7948 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7949 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7950 break;
7951 case X86ISD::VPERM2X128:
7952 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7953 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7954 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7955 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7956 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7957 break;
7958 case X86ISD::SHUF128:
7959 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7960 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7961 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7962 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7963 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7964 break;
7965 case X86ISD::MOVSLDUP:
7966 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7967 DecodeMOVSLDUPMask(NumElems, Mask);
7968 IsUnary = true;
7969 break;
7970 case X86ISD::MOVSHDUP:
7971 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7972 DecodeMOVSHDUPMask(NumElems, Mask);
7973 IsUnary = true;
7974 break;
7975 case X86ISD::MOVDDUP:
7976 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7977 DecodeMOVDDUPMask(NumElems, Mask);
7978 IsUnary = true;
7979 break;
7980 case X86ISD::VPERMIL2: {
7981 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7982 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7983 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7984 SDValue MaskNode = N->getOperand(2);
7985 SDValue CtrlNode = N->getOperand(3);
7986 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7987 unsigned CtrlImm = CtrlOp->getZExtValue();
7988 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7989 RawUndefs)) {
7990 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7991 Mask);
7992 break;
7993 }
7994 }
7995 return false;
7996 }
7997 case X86ISD::VPPERM: {
7998 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7999 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8000 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8001 SDValue MaskNode = N->getOperand(2);
8002 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8003 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8004 break;
8005 }
8006 return false;
8007 }
8008 case X86ISD::VPERMV: {
8009 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8010 IsUnary = true;
8011 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8012 Ops.push_back(N->getOperand(1));
8013 SDValue MaskNode = N->getOperand(0);
8014 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8015 RawUndefs)) {
8016 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8017 break;
8018 }
8019 return false;
8020 }
8021 case X86ISD::VPERMV3: {
8022 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8023 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8024 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8025 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8026 Ops.push_back(N->getOperand(0));
8027 Ops.push_back(N->getOperand(2));
8028 SDValue MaskNode = N->getOperand(1);
8029 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8030 RawUndefs)) {
8031 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8032 break;
8033 }
8034 return false;
8035 }
8036 default: llvm_unreachable("unknown target shuffle node");
8037 }
8038
8039 // Empty mask indicates the decode failed.
8040 if (Mask.empty())
8041 return false;
8042
8043 // Check if we're getting a shuffle mask with zero'd elements.
8044 if (!AllowSentinelZero && isAnyZero(Mask))
8045 return false;
8046
8047 // If we have a fake unary shuffle, the shuffle mask is spread across two
8048 // inputs that are actually the same node. Re-map the mask to always point
8049 // into the first input.
8050 if (IsFakeUnary)
8051 for (int &M : Mask)
8052 if (M >= (int)Mask.size())
8053 M -= Mask.size();
8054
8055 // If we didn't already add operands in the opcode-specific code, default to
8056 // adding 1 or 2 operands starting at 0.
8057 if (Ops.empty()) {
8058 Ops.push_back(N->getOperand(0));
8059 if (!IsUnary || IsFakeUnary)
8060 Ops.push_back(N->getOperand(1));
8061 }
8062
8063 return true;
8064}
8065
8066// Wrapper for getTargetShuffleMask that discards the IsUnary result.
8067static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8068 SmallVectorImpl<SDValue> &Ops,
8069 SmallVectorImpl<int> &Mask) {
8070 bool IsUnary;
8071 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8072}
8073
8074/// Compute whether each element of a shuffle is zeroable.
8075///
8076/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8077/// Either it is an undef element in the shuffle mask, the element of the input
8078/// referenced is undef, or the element of the input referenced is known to be
8079/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8080/// as many lanes with this technique as possible to simplify the remaining
8081/// shuffle.
8082static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8083 SDValue V1, SDValue V2,
8084 APInt &KnownUndef, APInt &KnownZero) {
8085 int Size = Mask.size();
8086 KnownUndef = KnownZero = APInt::getZero(Size);
8087
8088 V1 = peekThroughBitcasts(V1);
8089 V2 = peekThroughBitcasts(V2);
8090
8091 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8092 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8093
8094 int VectorSizeInBits = V1.getValueSizeInBits();
8095 int ScalarSizeInBits = VectorSizeInBits / Size;
8096 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8097
8098 for (int i = 0; i < Size; ++i) {
8099 int M = Mask[i];
8100 // Handle the easy cases.
8101 if (M < 0) {
8102 KnownUndef.setBit(i);
8103 continue;
8104 }
8105 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8106 KnownZero.setBit(i);
8107 continue;
8108 }
8109
8110 // Determine shuffle input and normalize the mask.
8111 SDValue V = M < Size ? V1 : V2;
8112 M %= Size;
8113
8114 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8115 if (V.getOpcode() != ISD::BUILD_VECTOR)
8116 continue;
8117
8118 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8119 // the (larger) source element must be UNDEF/ZERO.
8120 if ((Size % V.getNumOperands()) == 0) {
8121 int Scale = Size / V->getNumOperands();
8122 SDValue Op = V.getOperand(M / Scale);
8123 if (Op.isUndef())
8124 KnownUndef.setBit(i);
8125 if (X86::isZeroNode(Op))
8126 KnownZero.setBit(i);
8127 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8128 APInt Val = Cst->getAPIntValue();
8129 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8130 if (Val == 0)
8131 KnownZero.setBit(i);
8132 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8133 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8134 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8135 if (Val == 0)
8136 KnownZero.setBit(i);
8137 }
8138 continue;
8139 }
8140
8141 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8142 // elements must be UNDEF or ZERO.
8143 if ((V.getNumOperands() % Size) == 0) {
8144 int Scale = V->getNumOperands() / Size;
8145 bool AllUndef = true;
8146 bool AllZero = true;
8147 for (int j = 0; j < Scale; ++j) {
8148 SDValue Op = V.getOperand((M * Scale) + j);
8149 AllUndef &= Op.isUndef();
8150 AllZero &= X86::isZeroNode(Op);
8151 }
8152 if (AllUndef)
8153 KnownUndef.setBit(i);
8154 if (AllZero)
8155 KnownZero.setBit(i);
8156 continue;
8157 }
8158 }
8159}
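// Editorial illustration (not part of the analyzed file; values assumed):
// with Size = 4, Mask = { 0, -1, 4, 6 }, V1 = build_vector(x, y, z, w) and V2
// an all-zeros BUILD_VECTOR, element 1 is an undef mask index, so KnownUndef
// gets bit 1; elements 2 and 3 reference the all-zeros V2 (V2IsZero), so
// KnownZero gets bits 2 and 3; element 0 references a non-constant scalar of
// V1 and sets neither bit.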
8160
8161/// Decode a target shuffle mask and inputs and see if any values are
8162/// known to be undef or zero from their inputs.
8163/// Returns true if the target shuffle mask was decoded.
8164/// FIXME: Merge this with computeZeroableShuffleElements?
8165static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8166 SmallVectorImpl<SDValue> &Ops,
8167 APInt &KnownUndef, APInt &KnownZero) {
8168 bool IsUnary;
8169 if (!isTargetShuffle(N.getOpcode()))
8170 return false;
8171
8172 MVT VT = N.getSimpleValueType();
8173 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8174 return false;
8175
8176 int Size = Mask.size();
8177 SDValue V1 = Ops[0];
8178 SDValue V2 = IsUnary ? V1 : Ops[1];
8179 KnownUndef = KnownZero = APInt::getZero(Size);
8180
8181 V1 = peekThroughBitcasts(V1);
8182 V2 = peekThroughBitcasts(V2);
8183
8184 assert((VT.getSizeInBits() % Size) == 0 &&
8185        "Illegal split of shuffle value type");
8186 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8187
8188 // Extract known constant input data.
8189 APInt UndefSrcElts[2];
8190 SmallVector<APInt, 32> SrcEltBits[2];
8191 bool IsSrcConstant[2] = {
8192 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8193 SrcEltBits[0], true, false),
8194 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8195 SrcEltBits[1], true, false)};
8196
8197 for (int i = 0; i < Size; ++i) {
8198 int M = Mask[i];
8199
8200 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8201 if (M < 0) {
8202 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8203 if (SM_SentinelUndef == M)
8204 KnownUndef.setBit(i);
8205 if (SM_SentinelZero == M)
8206 KnownZero.setBit(i);
8207 continue;
8208 }
8209
8210 // Determine shuffle input and normalize the mask.
8211 unsigned SrcIdx = M / Size;
8212 SDValue V = M < Size ? V1 : V2;
8213 M %= Size;
8214
8215 // We are referencing an UNDEF input.
8216 if (V.isUndef()) {
8217 KnownUndef.setBit(i);
8218 continue;
8219 }
8220
8221 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8222 // TODO: We currently only set UNDEF for integer types - floats use the same
8223 // registers as vectors and many of the scalar folded loads rely on the
8224 // SCALAR_TO_VECTOR pattern.
8225 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8226 (Size % V.getValueType().getVectorNumElements()) == 0) {
8227 int Scale = Size / V.getValueType().getVectorNumElements();
8228 int Idx = M / Scale;
8229 if (Idx != 0 && !VT.isFloatingPoint())
8230 KnownUndef.setBit(i);
8231 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8232 KnownZero.setBit(i);
8233 continue;
8234 }
8235
8236 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8237 // base vectors.
8238 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8239 SDValue Vec = V.getOperand(0);
8240 int NumVecElts = Vec.getValueType().getVectorNumElements();
8241 if (Vec.isUndef() && Size == NumVecElts) {
8242 int Idx = V.getConstantOperandVal(2);
8243 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8244 if (M < Idx || (Idx + NumSubElts) <= M)
8245 KnownUndef.setBit(i);
8246 }
8247 continue;
8248 }
8249
8250 // Attempt to extract from the source's constant bits.
8251 if (IsSrcConstant[SrcIdx]) {
8252 if (UndefSrcElts[SrcIdx][M])
8253 KnownUndef.setBit(i);
8254 else if (SrcEltBits[SrcIdx][M] == 0)
8255 KnownZero.setBit(i);
8256 }
8257 }
8258
8259 assert(VT.getVectorNumElements() == (unsigned)Size &&
8260        "Different mask size from vector size!");
8261 return true;
8262}
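// Editorial illustration (not part of the analyzed file; inputs assumed):
// for an integer unary target shuffle whose bitcast-stripped input is
// SCALAR_TO_VECTOR(c) with the same element count as the mask, every mask
// element mapping to a lane other than lane 0 is marked KnownUndef, and a
// mask element mapping to lane 0 is marked KnownZero when c is the zero
// constant, per the SCALAR_TO_VECTOR handling above.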
8263
8264// Replace target shuffle mask elements with known undef/zero sentinels.
8265static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8266 const APInt &KnownUndef,
8267 const APInt &KnownZero,
8268 bool ResolveKnownZeros= true) {
8269 unsigned NumElts = Mask.size();
8270 assert(KnownUndef.getBitWidth() == NumElts &&
8271        KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8272
8273 for (unsigned i = 0; i != NumElts; ++i) {
8274 if (KnownUndef[i])
8275 Mask[i] = SM_SentinelUndef;
8276 else if (ResolveKnownZeros && KnownZero[i])
8277 Mask[i] = SM_SentinelZero;
8278 }
8279}
8280
8281// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8282static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8283 APInt &KnownUndef,
8284 APInt &KnownZero) {
8285 unsigned NumElts = Mask.size();
8286 KnownUndef = KnownZero = APInt::getZero(NumElts);
8287
8288 for (unsigned i = 0; i != NumElts; ++i) {
8289 int M = Mask[i];
8290 if (SM_SentinelUndef == M)
8291 KnownUndef.setBit(i);
8292 if (SM_SentinelZero == M)
8293 KnownZero.setBit(i);
8294 }
8295}
8296
8297// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8298static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8299 SDValue Cond, bool IsBLENDV = false) {
8300 EVT CondVT = Cond.getValueType();
8301 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8302 unsigned NumElts = CondVT.getVectorNumElements();
8303
8304 APInt UndefElts;
8305 SmallVector<APInt, 32> EltBits;
8306 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8307 true, false))
8308 return false;
8309
8310 Mask.resize(NumElts, SM_SentinelUndef);
8311
8312 for (int i = 0; i != (int)NumElts; ++i) {
8313 Mask[i] = i;
8314 // Arbitrarily choose from the 2nd operand if the select condition element
8315 // is undef.
8316 // TODO: Can we do better by matching patterns such as even/odd?
8317 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8318 (IsBLENDV && EltBits[i].isNonNegative()))
8319 Mask[i] += NumElts;
8320 }
8321
8322 return true;
8323}
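// Editorial illustration (not part of the analyzed file; values assumed):
// for a VSELECT (IsBLENDV == false) with NumElts = 4 and a constant condition
// { -1, 0, -1, 0 }, elements with a non-zero condition keep their index
// (select from the first value operand) while zero or undef condition
// elements are bumped by NumElts, giving the blend mask { 0, 5, 2, 7 }.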
8324
8325// Forward declaration (for getFauxShuffleMask recursive check).
8326static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8327 SmallVectorImpl<SDValue> &Inputs,
8328 SmallVectorImpl<int> &Mask,
8329 const SelectionDAG &DAG, unsigned Depth,
8330 bool ResolveKnownElts);
8331
8332// Attempt to decode ops that could be represented as a shuffle mask.
8333// The decoded shuffle mask may contain a different number of elements to the
8334// destination value type.
8335// TODO: Merge into getTargetShuffleInputs()
8336static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8337 SmallVectorImpl<int> &Mask,
8338 SmallVectorImpl<SDValue> &Ops,
8339 const SelectionDAG &DAG, unsigned Depth,
8340 bool ResolveKnownElts) {
8341 Mask.clear();
8342 Ops.clear();
8343
8344 MVT VT = N.getSimpleValueType();
8345 unsigned NumElts = VT.getVectorNumElements();
8346 unsigned NumSizeInBits = VT.getSizeInBits();
8347 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8348 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8349 return false;
8350 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8351 unsigned NumSizeInBytes = NumSizeInBits / 8;
8352 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8353
8354 unsigned Opcode = N.getOpcode();
8355 switch (Opcode) {
8356 case ISD::VECTOR_SHUFFLE: {
8357 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8358 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8359 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8360 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8361 Ops.push_back(N.getOperand(0));
8362 Ops.push_back(N.getOperand(1));
8363 return true;
8364 }
8365 return false;
8366 }
8367 case ISD::AND:
8368 case X86ISD::ANDNP: {
8369 // Attempt to decode as a per-byte mask.
8370 APInt UndefElts;
8371 SmallVector<APInt, 32> EltBits;
8372 SDValue N0 = N.getOperand(0);
8373 SDValue N1 = N.getOperand(1);
8374 bool IsAndN = (X86ISD::ANDNP == Opcode);
8375 uint64_t ZeroMask = IsAndN ? 255 : 0;
8376 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8377 return false;
8378 // We can't assume an undef src element gives an undef dst - the other src
8379 // might be zero.
8380 if (!UndefElts.isZero())
8381 return false;
8382 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8383 const APInt &ByteBits = EltBits[i];
8384 if (ByteBits != 0 && ByteBits != 255)
8385 return false;
8386 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8387 }
8388 Ops.push_back(IsAndN ? N1 : N0);
8389 return true;
8390 }
8391 case ISD::OR: {
8392 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8393 // is a valid shuffle index.
8394 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8395 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8396 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8397 return false;
8398
8399 SmallVector<int, 64> SrcMask0, SrcMask1;
8400 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8401 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8402 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8403 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8404 Depth + 1, true) ||
8405 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8406 Depth + 1, true))
8407 return false;
8408
8409 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8410 SmallVector<int, 64> Mask0, Mask1;
8411 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8412 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8413 for (int i = 0; i != (int)MaskSize; ++i) {
8414 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8415 // loops converting between OR and BLEND shuffles due to
8416 // canWidenShuffleElements merging away undef elements, meaning we
8417 // fail to recognise the OR as the undef element isn't known zero.
8418 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8419 Mask.push_back(SM_SentinelZero);
8420 else if (Mask1[i] == SM_SentinelZero)
8421 Mask.push_back(i);
8422 else if (Mask0[i] == SM_SentinelZero)
8423 Mask.push_back(i + MaskSize);
8424 else
8425 return false;
8426 }
8427 Ops.push_back(N0);
8428 Ops.push_back(N1);
8429 return true;
8430 }
8431 case ISD::INSERT_SUBVECTOR: {
8432 SDValue Src = N.getOperand(0);
8433 SDValue Sub = N.getOperand(1);
8434 EVT SubVT = Sub.getValueType();
8435 unsigned NumSubElts = SubVT.getVectorNumElements();
8436 if (!N->isOnlyUserOf(Sub.getNode()))
8437 return false;
8438 uint64_t InsertIdx = N.getConstantOperandVal(2);
8439 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8440 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8441 Sub.getOperand(0).getValueType() == VT) {
8442 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8443 for (int i = 0; i != (int)NumElts; ++i)
8444 Mask.push_back(i);
8445 for (int i = 0; i != (int)NumSubElts; ++i)
8446 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8447 Ops.push_back(Src);
8448 Ops.push_back(Sub.getOperand(0));
8449 return true;
8450 }
8451 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8452 SmallVector<int, 64> SubMask;
8453 SmallVector<SDValue, 2> SubInputs;
8454 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8455 EVT SubSrcVT = SubSrc.getValueType();
8456 if (!SubSrcVT.isVector())
8457 return false;
8458
8459 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8460 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8461 Depth + 1, ResolveKnownElts))
8462 return false;
8463
8464 // Subvector shuffle inputs must not be larger than the subvector.
8465 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8466 return SubVT.getFixedSizeInBits() <
8467 SubInput.getValueSizeInBits().getFixedValue();
8468 }))
8469 return false;
8470
8471 if (SubMask.size() != NumSubElts) {
8472 assert(((SubMask.size() % NumSubElts) == 0 ||
8473         (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8474 if ((NumSubElts % SubMask.size()) == 0) {
8475 int Scale = NumSubElts / SubMask.size();
8476 SmallVector<int,64> ScaledSubMask;
8477 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8478 SubMask = ScaledSubMask;
8479 } else {
8480 int Scale = SubMask.size() / NumSubElts;
8481 NumSubElts = SubMask.size();
8482 NumElts *= Scale;
8483 InsertIdx *= Scale;
8484 }
8485 }
8486 Ops.push_back(Src);
8487 Ops.append(SubInputs.begin(), SubInputs.end());
8488 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8489 Mask.append(NumElts, SM_SentinelZero);
8490 else
8491 for (int i = 0; i != (int)NumElts; ++i)
8492 Mask.push_back(i);
8493 for (int i = 0; i != (int)NumSubElts; ++i) {
8494 int M = SubMask[i];
8495 if (0 <= M) {
8496 int InputIdx = M / NumSubElts;
8497 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8498 }
8499 Mask[i + InsertIdx] = M;
8500 }
8501 return true;
8502 }
8503 case X86ISD::PINSRB:
8504 case X86ISD::PINSRW:
8505 case ISD::SCALAR_TO_VECTOR:
8506 case ISD::INSERT_VECTOR_ELT: {
8507 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8508 // vector, for matching src/dst vector types.
8509 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8510
8511 unsigned DstIdx = 0;
8512 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8513 // Check we have an in-range constant insertion index.
8514 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8515 N.getConstantOperandAPInt(2).uge(NumElts))
8516 return false;
8517 DstIdx = N.getConstantOperandVal(2);
8518
8519 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8520 if (X86::isZeroNode(Scl)) {
8521 Ops.push_back(N.getOperand(0));
8522 for (unsigned i = 0; i != NumElts; ++i)
8523 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8524 return true;
8525 }
8526 }
8527
8528 // Peek through trunc/aext/zext.
8529 // TODO: aext shouldn't require SM_SentinelZero padding.
8530 // TODO: handle shift of scalars.
8531 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8532 while (Scl.getOpcode() == ISD::TRUNCATE ||
8533 Scl.getOpcode() == ISD::ANY_EXTEND ||
8534 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8535 Scl = Scl.getOperand(0);
8536 MinBitsPerElt =
8537 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8538 }
8539 if ((MinBitsPerElt % 8) != 0)
8540 return false;
8541
8542 // Attempt to find the source vector the scalar was extracted from.
8543 SDValue SrcExtract;
8544 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8545 Scl.getOpcode() == X86ISD::PEXTRW ||
8546 Scl.getOpcode() == X86ISD::PEXTRB) &&
8547 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8548 SrcExtract = Scl;
8549 }
8550 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8551 return false;
8552
8553 SDValue SrcVec = SrcExtract.getOperand(0);
8554 EVT SrcVT = SrcVec.getValueType();
8555 if (!SrcVT.getScalarType().isByteSized())
8556 return false;
8557 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8558 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8559 unsigned DstByte = DstIdx * NumBytesPerElt;
8560 MinBitsPerElt =
8561 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8562
8563 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8564 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8565 Ops.push_back(SrcVec);
8566 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8567 } else {
8568 Ops.push_back(SrcVec);
8569 Ops.push_back(N.getOperand(0));
8570 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8571 Mask.push_back(NumSizeInBytes + i);
8572 }
8573
8574 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8575 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8576 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8577 Mask[DstByte + i] = SrcByte + i;
8578 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8579 Mask[DstByte + i] = SM_SentinelZero;
8580 return true;
8581 }
8582 case X86ISD::PACKSS:
8583 case X86ISD::PACKUS: {
8584 SDValue N0 = N.getOperand(0);
8585 SDValue N1 = N.getOperand(1);
8586 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8587        N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8588        "Unexpected input value type");
8589
8590 APInt EltsLHS, EltsRHS;
8591 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8592
8593 // If we know input saturation won't happen (or we don't care for particular
8594 // lanes), we can treat this as a truncation shuffle.
8595 bool Offset0 = false, Offset1 = false;
8596 if (Opcode == X86ISD::PACKSS) {
8597 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8598 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8599 (!(N1.isUndef() || EltsRHS.isZero()) &&
8600 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8601 return false;
8602 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8603 // PACKSS then it was likely being used for sign-extension for a
8604 // truncation, so just peek through and adjust the mask accordingly.
8605 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8606 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8607 Offset0 = true;
8608 N0 = N0.getOperand(0);
8609 }
8610 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8611 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8612 Offset1 = true;
8613 N1 = N1.getOperand(0);
8614 }
8615 } else {
8616 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8617 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8618 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8619 (!(N1.isUndef() || EltsRHS.isZero()) &&
8620 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8621 return false;
8622 }
8623
8624 bool IsUnary = (N0 == N1);
8625
8626 Ops.push_back(N0);
8627 if (!IsUnary)
8628 Ops.push_back(N1);
8629
8630 createPackShuffleMask(VT, Mask, IsUnary);
8631
8632 if (Offset0 || Offset1) {
8633 for (int &M : Mask)
8634 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8635 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8636 ++M;
8637 }
8638 return true;
8639 }
8640 case ISD::VSELECT:
8641 case X86ISD::BLENDV: {
8642 SDValue Cond = N.getOperand(0);
8643 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8644 Ops.push_back(N.getOperand(1));
8645 Ops.push_back(N.getOperand(2));
8646 return true;
8647 }
8648 return false;
8649 }
8650 case X86ISD::VTRUNC: {
8651 SDValue Src = N.getOperand(0);
8652 EVT SrcVT = Src.getValueType();
8653 // Truncated source must be a simple vector.
8654 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8655 (SrcVT.getScalarSizeInBits() % 8) != 0)
8656 return false;
8657 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8658 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8659 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8660 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8661 for (unsigned i = 0; i != NumSrcElts; ++i)
8662 Mask.push_back(i * Scale);
8663 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8664 Ops.push_back(Src);
8665 return true;
8666 }
8667 case X86ISD::VSHLI:
8668 case X86ISD::VSRLI: {
8669 uint64_t ShiftVal = N.getConstantOperandVal(1);
8670 // Out of range bit shifts are guaranteed to be zero.
8671 if (NumBitsPerElt <= ShiftVal) {
8672 Mask.append(NumElts, SM_SentinelZero);
8673 return true;
8674 }
8675
8676 // We can only decode 'whole byte' bit shifts as shuffles.
8677 if ((ShiftVal % 8) != 0)
8678 break;
8679
8680 uint64_t ByteShift = ShiftVal / 8;
8681 Ops.push_back(N.getOperand(0));
8682
8683 // Clear mask to all zeros and insert the shifted byte indices.
8684 Mask.append(NumSizeInBytes, SM_SentinelZero);
8685
8686 if (X86ISD::VSHLI == Opcode) {
8687 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8688 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8689 Mask[i + j] = i + j - ByteShift;
8690 } else {
8691 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8692 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8693 Mask[i + j - ByteShift] = i + j;
8694 }
8695 return true;
8696 }
8697 case X86ISD::VROTLI:
8698 case X86ISD::VROTRI: {
8699 // We can only decode 'whole byte' bit rotates as shuffles.
8700 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8701 if ((RotateVal % 8) != 0)
8702 return false;
8703 Ops.push_back(N.getOperand(0));
8704 int Offset = RotateVal / 8;
8705 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8706 for (int i = 0; i != (int)NumElts; ++i) {
8707 int BaseIdx = i * NumBytesPerElt;
8708 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8709 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8710 }
8711 }
8712 return true;
8713 }
8714 case X86ISD::VBROADCAST: {
8715 SDValue Src = N.getOperand(0);
8716 if (!Src.getSimpleValueType().isVector()) {
8717 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8718 !isNullConstant(Src.getOperand(1)) ||
8719 Src.getOperand(0).getValueType().getScalarType() !=
8720 VT.getScalarType())
8721 return false;
8722 Src = Src.getOperand(0);
8723 }
8724 Ops.push_back(Src);
8725 Mask.append(NumElts, 0);
8726 return true;
8727 }
8728 case ISD::ZERO_EXTEND:
8729 case ISD::ANY_EXTEND:
8730 case ISD::ZERO_EXTEND_VECTOR_INREG:
8731 case ISD::ANY_EXTEND_VECTOR_INREG: {
8732 SDValue Src = N.getOperand(0);
8733 EVT SrcVT = Src.getValueType();
8734
8735 // Extended source must be a simple vector.
8736 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8737 (SrcVT.getScalarSizeInBits() % 8) != 0)
8738 return false;
8739
8740 bool IsAnyExtend =
8741 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8742 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8743 IsAnyExtend, Mask);
8744 Ops.push_back(Src);
8745 return true;
8746 }
8747 }
8748
8749 return false;
8750}
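// Editorial illustration of the X86ISD::VSRLI case above (not part of the
// analyzed file; a v2i64 value shifted right by 16 bits is assumed): the
// shift is a whole number of bytes (ByteShift = 2), so the node decodes as
// the byte shuffle mask { 2, 3, 4, 5, 6, 7, Z, Z, 10, 11, 12, 13, 14, 15, Z, Z },
// where Z is SM_SentinelZero for the bytes shifted in at the top of each
// 64-bit element.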
8751
8752/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8753static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8754 SmallVectorImpl<int> &Mask) {
8755 int MaskWidth = Mask.size();
8756 SmallVector<SDValue, 16> UsedInputs;
8757 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8758 int lo = UsedInputs.size() * MaskWidth;
8759 int hi = lo + MaskWidth;
8760
8761 // Strip UNDEF input usage.
8762 if (Inputs[i].isUndef())
8763 for (int &M : Mask)
8764 if ((lo <= M) && (M < hi))
8765 M = SM_SentinelUndef;
8766
8767 // Check for unused inputs.
8768 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8769 for (int &M : Mask)
8770 if (lo <= M)
8771 M -= MaskWidth;
8772 continue;
8773 }
8774
8775 // Check for repeated inputs.
8776 bool IsRepeat = false;
8777 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8778 if (UsedInputs[j] != Inputs[i])
8779 continue;
8780 for (int &M : Mask)
8781 if (lo <= M)
8782 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8783 IsRepeat = true;
8784 break;
8785 }
8786 if (IsRepeat)
8787 continue;
8788
8789 UsedInputs.push_back(Inputs[i]);
8790 }
8791 Inputs = UsedInputs;
8792}
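// Editorial illustration (not part of the analyzed file; values assumed):
// with Inputs = { A, B, A }, MaskWidth = 4 and Mask = { 0, 9, 5, 2 }, the
// third input repeats the first, so mask index 9 (lane 1 of the repeated A)
// is remapped to 1 and the duplicate input is dropped, leaving
// Inputs = { A, B } and Mask = { 0, 1, 5, 2 }.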
8793
8794/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8795/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8796/// Returns true if the target shuffle mask was decoded.
8797static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8798 SmallVectorImpl<SDValue> &Inputs,
8799 SmallVectorImpl<int> &Mask,
8800 APInt &KnownUndef, APInt &KnownZero,
8801 const SelectionDAG &DAG, unsigned Depth,
8802 bool ResolveKnownElts) {
8803 if (Depth >= SelectionDAG::MaxRecursionDepth)
8804 return false; // Limit search depth.
8805
8806 EVT VT = Op.getValueType();
8807 if (!VT.isSimple() || !VT.isVector())
8808 return false;
8809
8810 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8811 if (ResolveKnownElts)
8812 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8813 return true;
8814 }
8815 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8816 ResolveKnownElts)) {
8817 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8818 return true;
8819 }
8820 return false;
8821}
8822
8823static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8824 SmallVectorImpl<SDValue> &Inputs,
8825 SmallVectorImpl<int> &Mask,
8826 const SelectionDAG &DAG, unsigned Depth,
8827 bool ResolveKnownElts) {
8828 APInt KnownUndef, KnownZero;
8829 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8830 KnownZero, DAG, Depth, ResolveKnownElts);
8831}
8832
8833static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8834 SmallVectorImpl<int> &Mask,
8835 const SelectionDAG &DAG, unsigned Depth = 0,
8836 bool ResolveKnownElts = true) {
8837 EVT VT = Op.getValueType();
8838 if (!VT.isSimple() || !VT.isVector())
8839 return false;
8840
8841 unsigned NumElts = Op.getValueType().getVectorNumElements();
8842 APInt DemandedElts = APInt::getAllOnes(NumElts);
8843 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8844 ResolveKnownElts);
8845}
8846
8847// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8848static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8849 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8850 SelectionDAG &DAG) {
8851 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8852         Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8853        "Unknown broadcast load type");
8854
8855 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8856 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8857 return SDValue();
8858
8859 SDValue Ptr =
8860 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8862 SDValue Ops[] = {Mem->getChain(), Ptr};
8863 SDValue BcstLd = DAG.getMemIntrinsicNode(
8864 Opcode, DL, Tys, Ops, MemVT,
8865 DAG.getMachineFunction().getMachineMemOperand(
8866 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8867 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8868 return BcstLd;
8869}
8870
8871/// Returns the scalar element that will make up the i'th
8872/// element of the result of the vector shuffle.
8873static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8874 SelectionDAG &DAG, unsigned Depth) {
8875 if (Depth >= SelectionDAG::MaxRecursionDepth)
8876 return SDValue(); // Limit search depth.
8877
8878 EVT VT = Op.getValueType();
8879 unsigned Opcode = Op.getOpcode();
8880 unsigned NumElems = VT.getVectorNumElements();
8881
8882 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8883 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8884 int Elt = SV->getMaskElt(Index);
8885
8886 if (Elt < 0)
8887 return DAG.getUNDEF(VT.getVectorElementType());
8888
8889 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8890 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8891 }
8892
8893 // Recurse into target specific vector shuffles to find scalars.
8894 if (isTargetShuffle(Opcode)) {
8895 MVT ShufVT = VT.getSimpleVT();
8896 MVT ShufSVT = ShufVT.getVectorElementType();
8897 int NumElems = (int)ShufVT.getVectorNumElements();
8898 SmallVector<int, 16> ShuffleMask;
8899 SmallVector<SDValue, 16> ShuffleOps;
8900 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8901 ShuffleMask))
8902 return SDValue();
8903
8904 int Elt = ShuffleMask[Index];
8905 if (Elt == SM_SentinelZero)
8906 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8907 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8908 if (Elt == SM_SentinelUndef)
8909 return DAG.getUNDEF(ShufSVT);
8910
8911 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8912 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8913 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8914 }
8915
8916 // Recurse into insert_subvector base/sub vector to find scalars.
8917 if (Opcode == ISD::INSERT_SUBVECTOR) {
8918 SDValue Vec = Op.getOperand(0);
8919 SDValue Sub = Op.getOperand(1);
8920 uint64_t SubIdx = Op.getConstantOperandVal(2);
8921 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8922
8923 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8924 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8925 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8926 }
8927
8928 // Recurse into concat_vectors sub vector to find scalars.
8929 if (Opcode == ISD::CONCAT_VECTORS) {
8930 EVT SubVT = Op.getOperand(0).getValueType();
8931 unsigned NumSubElts = SubVT.getVectorNumElements();
8932 uint64_t SubIdx = Index / NumSubElts;
8933 uint64_t SubElt = Index % NumSubElts;
8934 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8935 }
8936
8937 // Recurse into extract_subvector src vector to find scalars.
8938 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8939 SDValue Src = Op.getOperand(0);
8940 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8941 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8942 }
8943
8944 // We only peek through bitcasts of the same vector width.
8945 if (Opcode == ISD::BITCAST) {
8946 SDValue Src = Op.getOperand(0);
8947 EVT SrcVT = Src.getValueType();
8948 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8949 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8950 return SDValue();
8951 }
8952
8953 // Actual nodes that may contain scalar elements
8954
8955 // For insert_vector_elt - either return the index matching scalar or recurse
8956 // into the base vector.
8957 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8958 isa<ConstantSDNode>(Op.getOperand(2))) {
8959 if (Op.getConstantOperandAPInt(2) == Index)
8960 return Op.getOperand(1);
8961 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8962 }
8963
8964 if (Opcode == ISD::SCALAR_TO_VECTOR)
8965 return (Index == 0) ? Op.getOperand(0)
8966 : DAG.getUNDEF(VT.getVectorElementType());
8967
8968 if (Opcode == ISD::BUILD_VECTOR)
8969 return Op.getOperand(Index);
8970
8971 return SDValue();
8972}
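// Editorial illustration (not part of the analyzed file; values assumed):
// for Op = vector_shuffle<6,0,1,2>(V1, V2) with 4-element vectors and
// Index = 0, mask element 6 selects lane 2 of V2, so the walk recurses into
// V2; if V2 is a BUILD_VECTOR its operand 2 is returned, and a negative mask
// element would instead yield an UNDEF scalar.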
8973
8974// Use PINSRB/PINSRW/PINSRD to create a build vector.
8975static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8976 unsigned NumNonZero, unsigned NumZero,
8977 SelectionDAG &DAG,
8978 const X86Subtarget &Subtarget) {
8979 MVT VT = Op.getSimpleValueType();
8980 unsigned NumElts = VT.getVectorNumElements();
8981 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8982         ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8983        "Illegal vector insertion");
8984
8985 SDLoc dl(Op);
8986 SDValue V;
8987 bool First = true;
8988
8989 for (unsigned i = 0; i < NumElts; ++i) {
8990 bool IsNonZero = NonZeroMask[i];
8991 if (!IsNonZero)
8992 continue;
8993
8994 // If the build vector contains zeros or our first insertion is not the
8995 // first index then insert into zero vector to break any register
8996 // dependency else use SCALAR_TO_VECTOR.
8997 if (First) {
8998 First = false;
8999 if (NumZero || 0 != i)
9000 V = getZeroVector(VT, Subtarget, DAG, dl);
9001 else {
9002 assert(0 == i && "Expected insertion into zero-index");
9003 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9004 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9005 V = DAG.getBitcast(VT, V);
9006 continue;
9007 }
9008 }
9009 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9010 DAG.getIntPtrConstant(i, dl));
9011 }
9012
9013 return V;
9014}
9015
9016/// Custom lower build_vector of v16i8.
9017static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9018 unsigned NumNonZero, unsigned NumZero,
9019 SelectionDAG &DAG,
9020 const X86Subtarget &Subtarget) {
9021 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9022 return SDValue();
9023
9024 // SSE4.1 - use PINSRB to insert each byte directly.
9025 if (Subtarget.hasSSE41())
9026 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9027 Subtarget);
9028
9029 SDLoc dl(Op);
9030 SDValue V;
9031
9032 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9033 for (unsigned i = 0; i < 16; i += 2) {
9034 bool ThisIsNonZero = NonZeroMask[i];
9035 bool NextIsNonZero = NonZeroMask[i + 1];
9036 if (!ThisIsNonZero && !NextIsNonZero)
9037 continue;
9038
9039 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9040 SDValue Elt;
9041 if (ThisIsNonZero) {
9042 if (NumZero || NextIsNonZero)
9043 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9044 else
9045 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9046 }
9047
9048 if (NextIsNonZero) {
9049 SDValue NextElt = Op.getOperand(i + 1);
9050 if (i == 0 && NumZero)
9051 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9052 else
9053 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9054 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9055 DAG.getConstant(8, dl, MVT::i8));
9056 if (ThisIsNonZero)
9057 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9058 else
9059 Elt = NextElt;
9060 }
9061
9062 // If our first insertion is not the first index or zeros are needed, then
9063 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9064 // elements undefined).
9065 if (!V) {
9066 if (i != 0 || NumZero)
9067 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9068 else {
9069 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9070 V = DAG.getBitcast(MVT::v8i16, V);
9071 continue;
9072 }
9073 }
9074 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9075 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9076 DAG.getIntPtrConstant(i / 2, dl));
9077 }
9078
9079 return DAG.getBitcast(MVT::v16i8, V);
9080}
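// Editorial illustration (not part of the analyzed file; non-zero bytes b0
// and b1 assumed for one pair): the pre-SSE4.1 loop above widens both bytes
// to i32, combines them as (b1 << 8) | b0, truncates to i16 and inserts the
// combined value as element i/2 of a v8i16, the pattern that later selects
// to a single PINSRW instead of two byte insertions.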
9081
9082/// Custom lower build_vector of v8i16.
9083static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9084 unsigned NumNonZero, unsigned NumZero,
9085 SelectionDAG &DAG,
9086 const X86Subtarget &Subtarget) {
9087 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9088 return SDValue();
9089
9090 // Use PINSRW to insert each byte directly.
9091 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9092 Subtarget);
9093}
9094
9095/// Custom lower build_vector of v4i32 or v4f32.
9096static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9097 const X86Subtarget &Subtarget) {
9098 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9099 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9100 // Because we're creating a less complicated build vector here, we may enable
9101 // further folding of the MOVDDUP via shuffle transforms.
9102 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9103 Op.getOperand(0) == Op.getOperand(2) &&
9104 Op.getOperand(1) == Op.getOperand(3) &&
9105 Op.getOperand(0) != Op.getOperand(1)) {
9106 SDLoc DL(Op);
9107 MVT VT = Op.getSimpleValueType();
9108 MVT EltVT = VT.getVectorElementType();
9109 // Create a new build vector with the first 2 elements followed by undef
9110 // padding, bitcast to v2f64, duplicate, and bitcast back.
9111 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9112 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9113 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9114 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9115 return DAG.getBitcast(VT, Dup);
9116 }
9117
9118 // Find all zeroable elements.
9119 std::bitset<4> Zeroable, Undefs;
9120 for (int i = 0; i < 4; ++i) {
9121 SDValue Elt = Op.getOperand(i);
9122 Undefs[i] = Elt.isUndef();
9123 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9124 }
9125 assert(Zeroable.size() - Zeroable.count() > 1 &&
9126        "We expect at least two non-zero elements!");
9127
9128 // We only know how to deal with build_vector nodes where elements are either
9129 // zeroable or extract_vector_elt with constant index.
9130 SDValue FirstNonZero;
9131 unsigned FirstNonZeroIdx;
9132 for (unsigned i = 0; i < 4; ++i) {
9133 if (Zeroable[i])
9134 continue;
9135 SDValue Elt = Op.getOperand(i);
9136 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9137 !isa<ConstantSDNode>(Elt.getOperand(1)))
9138 return SDValue();
9139 // Make sure that this node is extracting from a 128-bit vector.
9140 MVT VT = Elt.getOperand(0).getSimpleValueType();
9141 if (!VT.is128BitVector())
9142 return SDValue();
9143 if (!FirstNonZero.getNode()) {
9144 FirstNonZero = Elt;
9145 FirstNonZeroIdx = i;
9146 }
9147 }
9148
9149 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9150 SDValue V1 = FirstNonZero.getOperand(0);
9151 MVT VT = V1.getSimpleValueType();
9152
9153 // See if this build_vector can be lowered as a blend with zero.
9154 SDValue Elt;
9155 unsigned EltMaskIdx, EltIdx;
9156 int Mask[4];
9157 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9158 if (Zeroable[EltIdx]) {
9159 // The zero vector will be on the right hand side.
9160 Mask[EltIdx] = EltIdx+4;
9161 continue;
9162 }
9163
9164 Elt = Op->getOperand(EltIdx);
9165 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9166 EltMaskIdx = Elt.getConstantOperandVal(1);
9167 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9168 break;
9169 Mask[EltIdx] = EltIdx;
9170 }
9171
9172 if (EltIdx == 4) {
9173 // Let the shuffle legalizer deal with blend operations.
9174 SDValue VZeroOrUndef = (Zeroable == Undefs)
9175 ? DAG.getUNDEF(VT)
9176 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9177 if (V1.getSimpleValueType() != VT)
9178 V1 = DAG.getBitcast(VT, V1);
9179 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9180 }
9181
9182 // See if we can lower this build_vector to a INSERTPS.
9183 if (!Subtarget.hasSSE41())
9184 return SDValue();
9185
9186 SDValue V2 = Elt.getOperand(0);
9187 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9188 V1 = SDValue();
9189
9190 bool CanFold = true;
9191 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9192 if (Zeroable[i])
9193 continue;
9194
9195 SDValue Current = Op->getOperand(i);
9196 SDValue SrcVector = Current->getOperand(0);
9197 if (!V1.getNode())
9198 V1 = SrcVector;
9199 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9200 }
9201
9202 if (!CanFold)
9203 return SDValue();
9204
9205 assert(V1.getNode() && "Expected at least two non-zero elements!");
9206 if (V1.getSimpleValueType() != MVT::v4f32)
9207 V1 = DAG.getBitcast(MVT::v4f32, V1);
9208 if (V2.getSimpleValueType() != MVT::v4f32)
9209 V2 = DAG.getBitcast(MVT::v4f32, V2);
9210
9211 // Ok, we can emit an INSERTPS instruction.
9212 unsigned ZMask = Zeroable.to_ulong();
9213
9214 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9215 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9216 SDLoc DL(Op);
9217 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9218 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9219 return DAG.getBitcast(VT, Result);
9220}
9221
9222/// Return a vector logical shift node.
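/// For example (roughly), a 64-bit left shift of a v2i64 value becomes
///   (bitcast v2i64 (VSHLDQ (bitcast v16i8 SrcOp), 8))
/// i.e. a whole-vector byte shift by NumBits / 8 bytes.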
9223static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9224 SelectionDAG &DAG, const TargetLowering &TLI,
9225 const SDLoc &dl) {
9226 assert(VT.is128BitVector() && "Unknown type for VShift");
9227 MVT ShVT = MVT::v16i8;
9228 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9229 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9230 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9231 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9232 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9233}
9234
9235static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9236 SelectionDAG &DAG) {
9237
9238 // Check if the scalar load can be widened into a vector load, and if
9239 // the address is "base + cst", see if the cst can be "absorbed" into
9240 // the shuffle mask.
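// E.g. (illustrative) a scalar i32 load from <slot + 8> that feeds a v4i32
// splat can become a 16-byte aligned v4i32 load of the slot followed by a
// splat shuffle of element 2 (offset 8 / 4 bytes per element).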
9241 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9242 SDValue Ptr = LD->getBasePtr();
9243 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9244 return SDValue();
9245 EVT PVT = LD->getValueType(0);
9246 if (PVT != MVT::i32 && PVT != MVT::f32)
9247 return SDValue();
9248
9249 int FI = -1;
9250 int64_t Offset = 0;
9251 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9252 FI = FINode->getIndex();
9253 Offset = 0;
9254 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9255 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9256 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9257 Offset = Ptr.getConstantOperandVal(1);
9258 Ptr = Ptr.getOperand(0);
9259 } else {
9260 return SDValue();
9261 }
9262
9263 // FIXME: 256-bit vector instructions don't require a strict alignment,
9264 // improve this code to support it better.
9265 Align RequiredAlign(VT.getSizeInBits() / 8);
9266 SDValue Chain = LD->getChain();
9267 // Make sure the stack object alignment is at least 16 or 32.
9268 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9269 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9270 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9271 if (MFI.isFixedObjectIndex(FI)) {
9272 // Can't change the alignment. FIXME: It's possible to compute
9273 // the exact stack offset and reference FI + adjusted offset instead,
9274 // if someone *really* cares about this; that's the way to implement it.
9275 return SDValue();
9276 } else {
9277 MFI.setObjectAlignment(FI, RequiredAlign);
9278 }
9279 }
9280
9281 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9282 // Ptr + (Offset & ~15).
9283 if (Offset < 0)
9284 return SDValue();
9285 if ((Offset % RequiredAlign.value()) & 3)
9286 return SDValue();
9287 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9288 if (StartOffset) {
9289 SDLoc DL(Ptr);
9290 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9291 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9292 }
9293
9294 int EltNo = (Offset - StartOffset) >> 2;
9295 unsigned NumElems = VT.getVectorNumElements();
9296
9297 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9298 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9299 LD->getPointerInfo().getWithOffset(StartOffset));
9300
9301 SmallVector<int, 8> Mask(NumElems, EltNo);
9302
9303 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9304 }
9305
9306 return SDValue();
9307}
9308
9309 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
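// E.g. (illustrative) (trunc i32 (srl (load i64 %p), 32)) resolves to that
// load with ByteOffset 4, and (extract_vector_elt (load v4i32 %p), 2)
// resolves to the vector load with ByteOffset 8.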
9310static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9311 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9312 auto *BaseLd = cast<LoadSDNode>(Elt);
9313 if (!BaseLd->isSimple())
9314 return false;
9315 Ld = BaseLd;
9316 ByteOffset = 0;
9317 return true;
9318 }
9319
9320 switch (Elt.getOpcode()) {
9321 case ISD::BITCAST:
9322 case ISD::TRUNCATE:
9323 case ISD::SCALAR_TO_VECTOR:
9324 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9325 case ISD::SRL:
9326 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9327 uint64_t Amt = AmtC->getZExtValue();
9328 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9329 ByteOffset += Amt / 8;
9330 return true;
9331 }
9332 }
9333 break;
9334 case ISD::EXTRACT_VECTOR_ELT:
9335 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9336 SDValue Src = Elt.getOperand(0);
9337 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9338 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9339 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9340 findEltLoadSrc(Src, Ld, ByteOffset)) {
9341 uint64_t Idx = IdxC->getZExtValue();
9342 ByteOffset += Idx * (SrcSizeInBits / 8);
9343 return true;
9344 }
9345 }
9346 break;
9347 }
9348
9349 return false;
9350}
9351
9352/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9353/// elements can be replaced by a single large load which has the same value as
9354/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9355///
9356/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9357static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9358 const SDLoc &DL, SelectionDAG &DAG,
9359 const X86Subtarget &Subtarget,
9360 bool IsAfterLegalize) {
9361 if ((VT.getScalarSizeInBits() % 8) != 0)
9362 return SDValue();
9363
9364 unsigned NumElems = Elts.size();
9365
9366 int LastLoadedElt = -1;
9367 APInt LoadMask = APInt::getZero(NumElems);
9368 APInt ZeroMask = APInt::getZero(NumElems);
9369 APInt UndefMask = APInt::getZero(NumElems);
9370
9371 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9372 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9373
9374 // For each element in the initializer, see if we've found a load, zero or an
9375 // undef.
9376 for (unsigned i = 0; i < NumElems; ++i) {
9377 SDValue Elt = peekThroughBitcasts(Elts[i]);
9378 if (!Elt.getNode())
9379 return SDValue();
9380 if (Elt.isUndef()) {
9381 UndefMask.setBit(i);
9382 continue;
9383 }
9384 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9385 ZeroMask.setBit(i);
9386 continue;
9387 }
9388
9389 // Each loaded element must be the correct fractional portion of the
9390 // requested vector load.
9391 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9392 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9393 return SDValue();
9394
9395 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9396 return SDValue();
9397 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9398 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9399 return SDValue();
9400
9401 LoadMask.setBit(i);
9402 LastLoadedElt = i;
9403 }
9404 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9405 NumElems &&
9406 "Incomplete element masks");
9407
9408 // Handle Special Cases - all undef or undef/zero.
9409 if (UndefMask.popcount() == NumElems)
9410 return DAG.getUNDEF(VT);
9411 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9412 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9413 : DAG.getConstantFP(0.0, DL, VT);
9414
9415 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9416 int FirstLoadedElt = LoadMask.countr_zero();
9417 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9418 EVT EltBaseVT = EltBase.getValueType();
9419 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9420 "Register/Memory size mismatch");
9421 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9422 assert(LDBase && "Did not find base load for merging consecutive loads");
9423 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9424 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9425 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9426 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9427 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9428
9429 // TODO: Support offsetting the base load.
9430 if (ByteOffsets[FirstLoadedElt] != 0)
9431 return SDValue();
9432
9433 // Check to see if the element's load is consecutive to the base load
9434 // or offset from a previous (already checked) load.
9435 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9436 LoadSDNode *Ld = Loads[EltIdx];
9437 int64_t ByteOffset = ByteOffsets[EltIdx];
9438 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9439 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9440 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9441 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9442 }
9443 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9444 EltIdx - FirstLoadedElt);
9445 };
9446
9447 // Consecutive loads can contain UNDEFs but not ZERO elements.
9448 // Consecutive loads with UNDEF and ZERO elements require an additional
9449 // shuffle stage to clear the ZERO elements.
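// E.g. (illustrative) <load a[0], zero, load a[2], load a[3]> still forms one
// consecutive load of a[0..3], but needs a blend with zero afterwards to
// clear element 1.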
9450 bool IsConsecutiveLoad = true;
9451 bool IsConsecutiveLoadWithZeros = true;
9452 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9453 if (LoadMask[i]) {
9454 if (!CheckConsecutiveLoad(LDBase, i)) {
9455 IsConsecutiveLoad = false;
9456 IsConsecutiveLoadWithZeros = false;
9457 break;
9458 }
9459 } else if (ZeroMask[i]) {
9460 IsConsecutiveLoad = false;
9461 }
9462 }
9463
9464 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9465 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9466 assert(LDBase->isSimple() &&
9467 "Cannot merge volatile or atomic loads.");
9468 SDValue NewLd =
9469 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9470 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9471 MMOFlags);
9472 for (auto *LD : Loads)
9473 if (LD)
9474 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9475 return NewLd;
9476 };
9477
9478 // Check if the base load is entirely dereferenceable.
9479 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9480 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9481
9482 // LOAD - all consecutive load/undefs (must start/end with a load or be
9483 // entirely dereferenceable). If we have found an entire vector of loads and
9484 // undefs, then return a large load of the entire vector width starting at the
9485 // base pointer. If the vector contains zeros, then attempt to shuffle those
9486 // elements.
9487 if (FirstLoadedElt == 0 &&
9488 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9489 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9490 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9491 return SDValue();
9492
9493 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9494 // will lower to regular temporal loads and use the cache.
9495 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9496 VT.is256BitVector() && !Subtarget.hasInt256())
9497 return SDValue();
9498
9499 if (NumElems == 1)
9500 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9501
9502 if (!ZeroMask)
9503 return CreateLoad(VT, LDBase);
9504
9505 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9506 // vector and a zero vector to clear out the zero elements.
9507 if (!IsAfterLegalize && VT.isVector()) {
9508 unsigned NumMaskElts = VT.getVectorNumElements();
9509 if ((NumMaskElts % NumElems) == 0) {
9510 unsigned Scale = NumMaskElts / NumElems;
9511 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9512 for (unsigned i = 0; i < NumElems; ++i) {
9513 if (UndefMask[i])
9514 continue;
9515 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9516 for (unsigned j = 0; j != Scale; ++j)
9517 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9518 }
9519 SDValue V = CreateLoad(VT, LDBase);
9520 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9521 : DAG.getConstantFP(0.0, DL, VT);
9522 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9523 }
9524 }
9525 }
9526
9527 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9528 if (VT.is256BitVector() || VT.is512BitVector()) {
9529 unsigned HalfNumElems = NumElems / 2;
9530 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9531 EVT HalfVT =
9532 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9533 SDValue HalfLD =
9534 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9535 DAG, Subtarget, IsAfterLegalize);
9536 if (HalfLD)
9537 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9538 HalfLD, DAG.getIntPtrConstant(0, DL));
9539 }
9540 }
9541
9542 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
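// E.g. (illustrative) (v4i32 build_vector (load i32 %p), (load i32 %p+4), 0, 0)
// can become a single v2i64 X86ISD::VZEXT_LOAD of %p that loads 64 bits and
// zeros the upper half.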
9543 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9544 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9545 LoadSizeInBits == 64) &&
9546 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9547 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9548 : MVT::getIntegerVT(LoadSizeInBits);
9549 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9550 // Allow v4f32 on SSE1 only targets.
9551 // FIXME: Add more isel patterns so we can just use VT directly.
9552 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9553 VecVT = MVT::v4f32;
9554 if (TLI.isTypeLegal(VecVT)) {
9555 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9556 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9557 SDValue ResNode = DAG.getMemIntrinsicNode(
9558 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9559 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9560 for (auto *LD : Loads)
9561 if (LD)
9562 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9563 return DAG.getBitcast(VT, ResNode);
9564 }
9565 }
9566
9567 // BROADCAST - match the smallest possible repetition pattern, load that
9568 // scalar/subvector element and then broadcast to the entire vector.
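// E.g. (illustrative) a v8i32 build_vector loading <a[0], a[1], a[0], a[1], ...>
// repeats every 64 bits, so it can be rebuilt as one 64-bit load of a[0..1]
// broadcast to all four 64-bit lanes.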
9569 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9570 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9571 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9572 unsigned RepeatSize = SubElems * BaseSizeInBits;
9573 unsigned ScalarSize = std::min(RepeatSize, 64u);
9574 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9575 continue;
9576
9577 // Don't attempt a 1:N subvector broadcast - it should be caught by
9578 // combineConcatVectorOps, otherwise it will cause infinite loops.
9579 if (RepeatSize > ScalarSize && SubElems == 1)
9580 continue;
9581
9582 bool Match = true;
9583 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9584 for (unsigned i = 0; i != NumElems && Match; ++i) {
9585 if (!LoadMask[i])
9586 continue;
9587 SDValue Elt = peekThroughBitcasts(Elts[i]);
9588 if (RepeatedLoads[i % SubElems].isUndef())
9589 RepeatedLoads[i % SubElems] = Elt;
9590 else
9591 Match &= (RepeatedLoads[i % SubElems] == Elt);
9592 }
9593
9594 // We must have loads at both ends of the repetition.
9595 Match &= !RepeatedLoads.front().isUndef();
9596 Match &= !RepeatedLoads.back().isUndef();
9597 if (!Match)
9598 continue;
9599
9600 EVT RepeatVT =
9601 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9602 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9603 : EVT::getFloatingPointVT(ScalarSize);
9604 if (RepeatSize > ScalarSize)
9605 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9606 RepeatSize / ScalarSize);
9607 EVT BroadcastVT =
9608 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9609 VT.getSizeInBits() / ScalarSize);
9610 if (TLI.isTypeLegal(BroadcastVT)) {
9611 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9612 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9613 SDValue Broadcast = RepeatLoad;
9614 if (RepeatSize > ScalarSize) {
9615 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9616 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9617 } else {
9618 if (!Subtarget.hasAVX2() &&
9619 !X86::mayFoldLoadIntoBroadcastFromMem(
9620 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9621 Subtarget,
9622 /*AssumeSingleUse=*/true))
9623 return SDValue();
9624 Broadcast =
9625 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9626 }
9627 return DAG.getBitcast(VT, Broadcast);
9628 }
9629 }
9630 }
9631 }
9632
9633 return SDValue();
9634}
9635
9636// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9637// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9638// are consecutive, non-overlapping, and in the right order.
9639static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9640 SelectionDAG &DAG,
9641 const X86Subtarget &Subtarget,
9642 bool IsAfterLegalize) {
9643 SmallVector<SDValue, 64> Elts;
9644 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9645 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9646 Elts.push_back(Elt);
9647 continue;
9648 }
9649 return SDValue();
9650 }
9651 assert(Elts.size() == VT.getVectorNumElements());
9652 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9653 IsAfterLegalize);
9654}
9655
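// Build an IR constant vector equivalent to splatting SplatValue over VT.
// E.g. (illustrative) a 64-bit SplatValue against a 32-bit element type is
// split into two 32-bit constants holding its low and high halves.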
9656static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9657 unsigned SplatBitSize, LLVMContext &C) {
9658 unsigned ScalarSize = VT.getScalarSizeInBits();
9659 unsigned NumElm = SplatBitSize / ScalarSize;
9660
9661 SmallVector<Constant *, 32> ConstantVec;
9662 for (unsigned i = 0; i < NumElm; i++) {
9663 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9664 Constant *Const;
9665 if (VT.isFloatingPoint()) {
9666 if (ScalarSize == 16) {
9667 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9668 } else if (ScalarSize == 32) {
9669 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9670 } else {
9671 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9672 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9673 }
9674 } else
9675 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9676 ConstantVec.push_back(Const);
9677 }
9678 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9679}
9680
9681static bool isFoldableUseOfShuffle(SDNode *N) {
9682 for (auto *U : N->uses()) {
9683 unsigned Opc = U->getOpcode();
9684 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9685 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9686 return false;
9687 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9688 return false;
9689 if (isTargetShuffle(Opc))
9690 return true;
9691 if (Opc == ISD::BITCAST) // Ignore bitcasts
9692 return isFoldableUseOfShuffle(U);
9693 if (N->hasOneUse()) {
9694 // TODO: there may be some general way to know if an SDNode can
9695 // be folded. For now we only know whether an MI is foldable.
9696 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9697 return false;
9698 return true;
9699 }
9700 }
9701 return false;
9702}
9703
9704/// Attempt to use the vbroadcast instruction to generate a splat value
9705/// from a splat BUILD_VECTOR which uses:
9706/// a. A single scalar load, or a constant.
9707/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9708///
9709/// The VBROADCAST node is returned when a pattern is found,
9710/// or SDValue() otherwise.
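/// E.g. (illustrative) a v8f32 build_vector splatting (load f32 %p) would
/// ideally become a single v8f32 X86ISD::VBROADCAST_LOAD of %p.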
9711static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9712 const X86Subtarget &Subtarget,
9713 SelectionDAG &DAG) {
9714 // VBROADCAST requires AVX.
9715 // TODO: Splats could be generated for non-AVX CPUs using SSE
9716 // instructions, but there's less potential gain for only 128-bit vectors.
9717 if (!Subtarget.hasAVX())
9718 return SDValue();
9719
9720 MVT VT = BVOp->getSimpleValueType(0);
9721 unsigned NumElts = VT.getVectorNumElements();
9722 SDLoc dl(BVOp);
9723
9724 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9725 "Unsupported vector type for broadcast.");
9726
9727 // See if the build vector is a repeating sequence of scalars (inc. splat).
9728 SDValue Ld;
9729 BitVector UndefElements;
9730 SmallVector<SDValue, 16> Sequence;
9731 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9732 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9733 if (Sequence.size() == 1)
9734 Ld = Sequence[0];
9735 }
9736
9737 // Attempt to use VBROADCASTM
9738 // From this pattern:
9739 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9740 // b. t1 = (build_vector t0 t0)
9741 //
9742 // Create (VBROADCASTM v2i1 X)
9743 if (!Sequence.empty() && Subtarget.hasCDI()) {
9744 // If not a splat, are the upper sequence values zeroable?
9745 unsigned SeqLen = Sequence.size();
9746 bool UpperZeroOrUndef =
9747 SeqLen == 1 ||
9748 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9749 return !V || V.isUndef() || isNullConstant(V);
9750 });
9751 SDValue Op0 = Sequence[0];
9752 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9753 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9754 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9755 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9756 ? Op0.getOperand(0)
9757 : Op0.getOperand(0).getOperand(0);
9758 MVT MaskVT = BOperand.getSimpleValueType();
9759 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9760 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9761 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9762 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9763 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9764 unsigned Scale = 512 / VT.getSizeInBits();
9765 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9766 }
9767 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9768 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9769 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9770 return DAG.getBitcast(VT, Bcst);
9771 }
9772 }
9773 }
9774
9775 unsigned NumUndefElts = UndefElements.count();
9776 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9777 APInt SplatValue, Undef;
9778 unsigned SplatBitSize;
9779 bool HasUndef;
9780 // Check if this is a repeated constant pattern suitable for broadcasting.
9781 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9782 SplatBitSize > VT.getScalarSizeInBits() &&
9783 SplatBitSize < VT.getSizeInBits()) {
9784 // Avoid replacing with broadcast when it's a use of a shuffle
9785 // instruction to preserve the present custom lowering of shuffles.
9786 if (isFoldableUseOfShuffle(BVOp))
9787 return SDValue();
9788 // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
9789 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9790 LLVMContext *Ctx = DAG.getContext();
9791 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9792 if (Subtarget.hasAVX()) {
9793 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9794 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9795 // Splatted value can fit in one INTEGER constant in constant pool.
9796 // Load the constant and broadcast it.
9797 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9798 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9799 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9800 SDValue CP = DAG.getConstantPool(C, PVT);
9801 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9802
9803 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9804 SDVTList Tys =
9805 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9806 SDValue Ops[] = {DAG.getEntryNode(), CP};
9807 MachinePointerInfo MPI =
9808 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9809 SDValue Brdcst = DAG.getMemIntrinsicNode(
9810 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9811 MachineMemOperand::MOLoad);
9812 return DAG.getBitcast(VT, Brdcst);
9813 }
9814 if (SplatBitSize > 64) {
9815 // Load the vector of constants and broadcast it.
9816 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9817 *Ctx);
9818 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9819 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9820 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9821 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9822 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9823 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9824 MachinePointerInfo MPI =
9825 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9826 return DAG.getMemIntrinsicNode(
9827 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9828 MachineMemOperand::MOLoad);
9829 }
9830 }
9831 }
9832
9833 // If we are moving a scalar into a vector (Ld must be set and all elements
9834 // but 1 are undef) and that operation is not obviously supported by
9835 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9836 // That's better than general shuffling and may eliminate a load to GPR and
9837 // move from scalar to vector register.
9838 if (!Ld || NumElts - NumUndefElts != 1)
9839 return SDValue();
9840 unsigned ScalarSize = Ld.getValueSizeInBits();
9841 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9842 return SDValue();
9843 }
9844
9845 bool ConstSplatVal =
9846 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9847 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9848
9849 // TODO: Handle broadcasts of non-constant sequences.
9850
9851 // Make sure that all of the users of a non-constant load are from the
9852 // BUILD_VECTOR node.
9853 // FIXME: Is the use count needed for non-constant, non-load case?
9854 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9855 return SDValue();
9856
9857 unsigned ScalarSize = Ld.getValueSizeInBits();
9858 bool IsGE256 = (VT.getSizeInBits() >= 256);
9859
9860 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9861 // instruction to save 8 or more bytes of constant pool data.
9862 // TODO: If multiple splats are generated to load the same constant,
9863 // it may be detrimental to overall size. There needs to be a way to detect
9864 // that condition to know if this is truly a size win.
9865 bool OptForSize = DAG.shouldOptForSize();
9866
9867 // Handle broadcasting a single constant scalar from the constant pool
9868 // into a vector.
9869 // On Sandybridge (no AVX2), it is still better to load a constant vector
9870 // from the constant pool and not to broadcast it from a scalar.
9871 // But override that restriction when optimizing for size.
9872 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9873 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9874 EVT CVT = Ld.getValueType();
9875 assert(!CVT.isVector() && "Must not broadcast a vector type");
9876
9877 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9878 // For size optimization, also splat v2f64 and v2i64, and for size opt
9879 // with AVX2, also splat i8 and i16.
9880 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9881 if (ScalarSize == 32 ||
9882 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9883 CVT == MVT::f16 ||
9884 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9885 const Constant *C = nullptr;
9886 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9887 C = CI->getConstantIntValue();
9888 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9889 C = CF->getConstantFPValue();
9890
9891 assert(C && "Invalid constant type");
9892
9893 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9894 SDValue CP =
9895 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9896 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9897
9898 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9899 SDValue Ops[] = {DAG.getEntryNode(), CP};
9900 MachinePointerInfo MPI =
9901 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9902 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9903 MPI, Alignment, MachineMemOperand::MOLoad);
9904 }
9905 }
9906
9907 // Handle AVX2 in-register broadcasts.
9908 if (!IsLoad && Subtarget.hasInt256() &&
9909 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9910 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9911
9912 // The scalar source must be a normal load.
9913 if (!IsLoad)
9914 return SDValue();
9915
9916 // Make sure the non-chain result is only used by this build vector.
9917 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9918 return SDValue();
9919
9920 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9921 (Subtarget.hasVLX() && ScalarSize == 64)) {
9922 auto *LN = cast<LoadSDNode>(Ld);
9923 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9924 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9925 SDValue BCast =
9926 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9927 LN->getMemoryVT(), LN->getMemOperand());
9928 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9929 return BCast;
9930 }
9931
9932 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9933 // match double, since there is no vbroadcastsd xmm.
9934 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9935 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9936 auto *LN = cast<LoadSDNode>(Ld);
9937 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9938 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9939 SDValue BCast =
9940 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9941 LN->getMemoryVT(), LN->getMemOperand());
9942 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9943 return BCast;
9944 }
9945
9946 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9947 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9948
9949 // Unsupported broadcast.
9950 return SDValue();
9951}
9952
9953/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9954/// underlying vector and index.
9955///
9956/// Modifies \p ExtractedFromVec to the real vector and returns the real
9957/// index.
9958static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9959 SDValue ExtIdx) {
9960 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9961 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9962 return Idx;
9963
9964 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9965 // lowered this:
9966 // (extract_vector_elt (v8f32 %1), Constant<6>)
9967 // to:
9968 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9969 // (extract_subvector (v8f32 %0), Constant<4>),
9970 // undef)
9971 // Constant<0>)
9972 // In this case the vector is the extract_subvector expression and the index
9973 // is 2, as specified by the shuffle.
9974 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9975 SDValue ShuffleVec = SVOp->getOperand(0);
9976 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9977 assert(ShuffleVecVT.getVectorElementType() ==
9978 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9979
9980 int ShuffleIdx = SVOp->getMaskElt(Idx);
9981 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9982 ExtractedFromVec = ShuffleVec;
9983 return ShuffleIdx;
9984 }
9985 return Idx;
9986}
9987
9988static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9989 MVT VT = Op.getSimpleValueType();
9990
9991 // Skip if insert_vec_elt is not supported.
9992 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9993 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9994 return SDValue();
9995
9996 SDLoc DL(Op);
9997 unsigned NumElems = Op.getNumOperands();
9998
9999 SDValue VecIn1;
10000 SDValue VecIn2;
10001 SmallVector<unsigned, 4> InsertIndices;
10002 SmallVector<int, 8> Mask(NumElems, -1);
10003
10004 for (unsigned i = 0; i != NumElems; ++i) {
10005 unsigned Opc = Op.getOperand(i).getOpcode();
10006
10007 if (Opc == ISD::UNDEF)
10008 continue;
10009
10010 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10011 // Quit if more than 1 element needs inserting.
10012 if (InsertIndices.size() > 1)
10013 return SDValue();
10014
10015 InsertIndices.push_back(i);
10016 continue;
10017 }
10018
10019 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10020 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10021
10022 // Quit if non-constant index.
10023 if (!isa<ConstantSDNode>(ExtIdx))
10024 return SDValue();
10025 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10026
10027 // Quit if extracted from vector of different type.
10028 if (ExtractedFromVec.getValueType() != VT)
10029 return SDValue();
10030
10031 if (!VecIn1.getNode())
10032 VecIn1 = ExtractedFromVec;
10033 else if (VecIn1 != ExtractedFromVec) {
10034 if (!VecIn2.getNode())
10035 VecIn2 = ExtractedFromVec;
10036 else if (VecIn2 != ExtractedFromVec)
10037 // Quit if more than 2 vectors to shuffle
10038 return SDValue();
10039 }
10040
10041 if (ExtractedFromVec == VecIn1)
10042 Mask[i] = Idx;
10043 else if (ExtractedFromVec == VecIn2)
10044 Mask[i] = Idx + NumElems;
10045 }
10046
10047 if (!VecIn1.getNode())
10048 return SDValue();
10049
10050 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10051 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10052
10053 for (unsigned Idx : InsertIndices)
10054 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10055 DAG.getIntPtrConstant(Idx, DL));
10056
10057 return NV;
10058}
10059
10060// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
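// The bf16 elements are bitcast to i16, the vector is rebuilt as the matching
// integer type (e.g. v8bf16 -> v8i16), and the result is bitcast back.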
10061static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10062 const X86Subtarget &Subtarget) {
10063 MVT VT = Op.getSimpleValueType();
10064 MVT IVT = VT.changeVectorElementTypeToInteger();
10065 SmallVector<SDValue, 16> NewOps;
10066 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10067 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10068 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10069 return DAG.getBitcast(VT, Res);
10070}
10071
10072// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10073static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10074 const X86Subtarget &Subtarget) {
10075
10076 MVT VT = Op.getSimpleValueType();
10077 assert((VT.getVectorElementType() == MVT::i1) &&
10078 "Unexpected type in LowerBUILD_VECTORvXi1!");
10079
10080 SDLoc dl(Op);
10081 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10082 ISD::isBuildVectorAllOnes(Op.getNode()))
10083 return Op;
10084
10085 uint64_t Immediate = 0;
10086 SmallVector<unsigned, 16> NonConstIdx;
10087 bool IsSplat = true;
10088 bool HasConstElts = false;
10089 int SplatIdx = -1;
10090 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10091 SDValue In = Op.getOperand(idx);
10092 if (In.isUndef())
10093 continue;
10094 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10095 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10096 HasConstElts = true;
10097 } else {
10098 NonConstIdx.push_back(idx);
10099 }
10100 if (SplatIdx < 0)
10101 SplatIdx = idx;
10102 else if (In != Op.getOperand(SplatIdx))
10103 IsSplat = false;
10104 }
10105
10106 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
10107 if (IsSplat) {
10108 // The build_vector allows the scalar element to be larger than the vector
10109 // element type. We need to mask it to use as a condition unless we know
10110 // the upper bits are zero.
10111 // FIXME: Use computeKnownBits instead of checking specific opcode?
10112 SDValue Cond = Op.getOperand(SplatIdx);
10113 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10114 if (Cond.getOpcode() != ISD::SETCC)
10115 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10116 DAG.getConstant(1, dl, MVT::i8));
10117
10118 // Perform the select in the scalar domain so we can use cmov.
10119 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10120 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10121 DAG.getAllOnesConstant(dl, MVT::i32),
10122 DAG.getConstant(0, dl, MVT::i32));
10123 Select = DAG.getBitcast(MVT::v32i1, Select);
10124 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10125 } else {
10126 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10127 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10128 DAG.getAllOnesConstant(dl, ImmVT),
10129 DAG.getConstant(0, dl, ImmVT));
10130 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10131 Select = DAG.getBitcast(VecVT, Select);
10132 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10133 DAG.getIntPtrConstant(0, dl));
10134 }
10135 }
10136
10137 // Insert the non-constant elements one by one.
10138 SDValue DstVec;
10139 if (HasConstElts) {
10140 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10141 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10142 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10143 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10144 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10145 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10146 } else {
10147 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10148 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10149 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10150 DstVec = DAG.getBitcast(VecVT, Imm);
10151 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10152 DAG.getIntPtrConstant(0, dl));
10153 }
10154 } else
10155 DstVec = DAG.getUNDEF(VT);
10156
10157 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10158 unsigned InsertIdx = NonConstIdx[i];
10159 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10160 Op.getOperand(InsertIdx),
10161 DAG.getIntPtrConstant(InsertIdx, dl));
10162 }
10163 return DstVec;
10164}
10165
10166LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10167 switch (Opcode) {
10168 case X86ISD::PACKSS:
10169 case X86ISD::PACKUS:
10170 case X86ISD::FHADD:
10171 case X86ISD::FHSUB:
10172 case X86ISD::HADD:
10173 case X86ISD::HSUB:
10174 return true;
10175 }
10176 return false;
10177}
10178
10179/// This is a helper function of LowerToHorizontalOp().
10180/// This function checks that the build_vector \p N in input implements a
10181/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10182/// may not match the layout of an x86 256-bit horizontal instruction.
10183/// In other words, if this returns true, then some extraction/insertion will
10184/// be required to produce a valid horizontal instruction.
10185///
10186/// Parameter \p Opcode defines the kind of horizontal operation to match.
10187/// For example, if \p Opcode is equal to ISD::ADD, then this function
10188/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10189/// is equal to ISD::SUB, then this function checks if this is a horizontal
10190/// arithmetic sub.
10191///
10192/// This function only analyzes elements of \p N whose indices are
10193/// in range [BaseIdx, LastIdx).
10194///
10195/// TODO: This function was originally used to match both real and fake partial
10196/// horizontal operations, but the index-matching logic is incorrect for that.
10197/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10198/// code because it is only used for partial h-op matching now?
10199static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10200 SelectionDAG &DAG,
10201 unsigned BaseIdx, unsigned LastIdx,
10202 SDValue &V0, SDValue &V1) {
10203 EVT VT = N->getValueType(0);
10204 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10205 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10206 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10207 "Invalid Vector in input!");
10208
10209 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10210 bool CanFold = true;
10211 unsigned ExpectedVExtractIdx = BaseIdx;
10212 unsigned NumElts = LastIdx - BaseIdx;
10213 V0 = DAG.getUNDEF(VT);
10214 V1 = DAG.getUNDEF(VT);
10215
10216 // Check if N implements a horizontal binop.
10217 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10218 SDValue Op = N->getOperand(i + BaseIdx);
10219
10220 // Skip UNDEFs.
10221 if (Op->isUndef()) {
10222 // Update the expected vector extract index.
10223 if (i * 2 == NumElts)
10224 ExpectedVExtractIdx = BaseIdx;
10225 ExpectedVExtractIdx += 2;
10226 continue;
10227 }
10228
10229 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10230
10231 if (!CanFold)
10232 break;
10233
10234 SDValue Op0 = Op.getOperand(0);
10235 SDValue Op1 = Op.getOperand(1);
10236
10237 // Try to match the following pattern:
10238 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10239 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10240 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10241 Op0.getOperand(0) == Op1.getOperand(0) &&
10242 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10243 isa<ConstantSDNode>(Op1.getOperand(1)));
10244 if (!CanFold)
10245 break;
10246
10247 unsigned I0 = Op0.getConstantOperandVal(1);
10248 unsigned I1 = Op1.getConstantOperandVal(1);
10249
10250 if (i * 2 < NumElts) {
10251 if (V0.isUndef()) {
10252 V0 = Op0.getOperand(0);
10253 if (V0.getValueType() != VT)
10254 return false;
10255 }
10256 } else {
10257 if (V1.isUndef()) {
10258 V1 = Op0.getOperand(0);
10259 if (V1.getValueType() != VT)
10260 return false;
10261 }
10262 if (i * 2 == NumElts)
10263 ExpectedVExtractIdx = BaseIdx;
10264 }
10265
10266 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10267 if (I0 == ExpectedVExtractIdx)
10268 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10269 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10270 // Try to match the following dag sequence:
10271 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10272 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10273 } else
10274 CanFold = false;
10275
10276 ExpectedVExtractIdx += 2;
10277 }
10278
10279 return CanFold;
10280}
10281
10282/// Emit a sequence of two 128-bit horizontal add/sub followed by
10283/// a concat_vector.
10284///
10285/// This is a helper function of LowerToHorizontalOp().
10286/// This function expects two 256-bit vectors called V0 and V1.
10287/// At first, each vector is split into two separate 128-bit vectors.
10288/// Then, the resulting 128-bit vectors are used to implement two
10289/// horizontal binary operations.
10290///
10291/// The kind of horizontal binary operation is defined by \p X86Opcode.
10292///
10293/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
10294/// the two new horizontal binop.
10295/// When Mode is set, the first horizontal binop dag node would take as input
10296/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10297/// horizontal binop dag node would take as input the lower 128-bit of V1
10298/// and the upper 128-bit of V1.
10299/// Example:
10300/// HADD V0_LO, V0_HI
10301/// HADD V1_LO, V1_HI
10302///
10303/// Otherwise, the first horizontal binop dag node takes as input the lower
10304/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10305/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10306/// Example:
10307/// HADD V0_LO, V1_LO
10308/// HADD V0_HI, V1_HI
10309///
10310/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10311/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10312/// the upper 128-bits of the result.
10313static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10314 const SDLoc &DL, SelectionDAG &DAG,
10315 unsigned X86Opcode, bool Mode,
10316 bool isUndefLO, bool isUndefHI) {
10317 MVT VT = V0.getSimpleValueType();
10318 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10319 "Invalid nodes in input!");
10320
10321 unsigned NumElts = VT.getVectorNumElements();
10322 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10323 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10324 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10325 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10326 MVT NewVT = V0_LO.getSimpleValueType();
10327
10328 SDValue LO = DAG.getUNDEF(NewVT);
10329 SDValue HI = DAG.getUNDEF(NewVT);
10330
10331 if (Mode) {
10332 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10333 if (!isUndefLO && !V0->isUndef())
10334 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10335 if (!isUndefHI && !V1->isUndef())
10336 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10337 } else {
10338 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10339 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10340 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10341
10342 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10343 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10344 }
10345
10346 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10347}
10348
10349/// Returns true iff \p BV builds a vector with the result equivalent to
10350/// the result of ADDSUB/SUBADD operation.
10351/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10352/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10353/// \p Opnd0 and \p Opnd1.
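/// E.g. (illustrative) a v4f32 build_vector whose lane i computes
/// (extractelt %a, i) FSUB/FADD (extractelt %b, i) for even/odd i matches with
/// Opnd0 = %a, Opnd1 = %b and IsSubAdd == false (an X86ISD::ADDSUB candidate).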
10354static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10355 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10356 SDValue &Opnd0, SDValue &Opnd1,
10357 unsigned &NumExtracts,
10358 bool &IsSubAdd) {
10359
10360 MVT VT = BV->getSimpleValueType(0);
10361 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10362 return false;
10363
10364 unsigned NumElts = VT.getVectorNumElements();
10365 SDValue InVec0 = DAG.getUNDEF(VT);
10366 SDValue InVec1 = DAG.getUNDEF(VT);
10367
10368 NumExtracts = 0;
10369
10370 // Odd-numbered elements in the input build vector are obtained from
10371 // adding/subtracting two integer/float elements.
10372 // Even-numbered elements in the input build vector are obtained from
10373 // subtracting/adding two integer/float elements.
10374 unsigned Opc[2] = {0, 0};
10375 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10376 SDValue Op = BV->getOperand(i);
10377
10378 // Skip 'undef' values.
10379 unsigned Opcode = Op.getOpcode();
10380 if (Opcode == ISD::UNDEF)
10381 continue;
10382
10383 // Early exit if we found an unexpected opcode.
10384 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10385 return false;
10386
10387 SDValue Op0 = Op.getOperand(0);
10388 SDValue Op1 = Op.getOperand(1);
10389
10390 // Try to match the following pattern:
10391 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10392 // Early exit if we cannot match that sequence.
10393 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10394 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10395 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10396 Op0.getOperand(1) != Op1.getOperand(1))
10397 return false;
10398
10399 unsigned I0 = Op0.getConstantOperandVal(1);
10400 if (I0 != i)
10401 return false;
10402
10403 // We found a valid add/sub node; make sure it's the same opcode as previous
10404 // elements for this parity.
10405 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10406 return false;
10407 Opc[i % 2] = Opcode;
10408
10409 // Update InVec0 and InVec1.
10410 if (InVec0.isUndef()) {
10411 InVec0 = Op0.getOperand(0);
10412 if (InVec0.getSimpleValueType() != VT)
10413 return false;
10414 }
10415 if (InVec1.isUndef()) {
10416 InVec1 = Op1.getOperand(0);
10417 if (InVec1.getSimpleValueType() != VT)
10418 return false;
10419 }
10420
10421 // Make sure that the operands of each add/sub node always
10422 // come from the same pair of vectors.
10423 if (InVec0 != Op0.getOperand(0)) {
10424 if (Opcode == ISD::FSUB)
10425 return false;
10426
10427 // FADD is commutable. Try to commute the operands
10428 // and then test again.
10429 std::swap(Op0, Op1);
10430 if (InVec0 != Op0.getOperand(0))
10431 return false;
10432 }
10433
10434 if (InVec1 != Op1.getOperand(0))
10435 return false;
10436
10437 // Increment the number of extractions done.
10438 ++NumExtracts;
10439 }
10440
10441 // Ensure we have found an opcode for both parities and that they are
10442 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10443 // inputs are undef.
10444 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10445 InVec0.isUndef() || InVec1.isUndef())
10446 return false;
10447
10448 IsSubAdd = Opc[0] == ISD::FADD;
10449
10450 Opnd0 = InVec0;
10451 Opnd1 = InVec1;
10452 return true;
10453}
10454
10455/// Returns true if it is possible to fold MUL and an idiom that has already been
10456/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10457/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10458/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10459///
10460/// Prior to calling this function it should be known that there is some
10461/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10462/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10463/// before replacement of such SDNode with ADDSUB operation. Thus the number
10464/// of \p Opnd0 uses is expected to be equal to 2.
10465/// For example, this function may be called for the following IR:
10466/// %AB = fmul fast <2 x double> %A, %B
10467/// %Sub = fsub fast <2 x double> %AB, %C
10468/// %Add = fadd fast <2 x double> %AB, %C
10469/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10470/// <2 x i32> <i32 0, i32 3>
10471/// There is a def for %Addsub here, which potentially can be replaced by
10472/// X86ISD::ADDSUB operation:
10473/// %Addsub = X86ISD::ADDSUB %AB, %C
10474/// and such ADDSUB can further be replaced with FMADDSUB:
10475/// %Addsub = FMADDSUB %A, %B, %C.
10476///
10477/// The main reason why this method is called before the replacement of the
10478/// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
10479/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
10480/// FMADDSUB is.
10481static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10482 SelectionDAG &DAG,
10483 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10484 unsigned ExpectedUses) {
10485 if (Opnd0.getOpcode() != ISD::FMUL ||
10486 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10487 return false;
10488
10489 // FIXME: These checks must match the similar ones in
10490 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10491 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10492 // or MUL + ADDSUB to FMADDSUB.
10493 const TargetOptions &Options = DAG.getTarget().Options;
10494 bool AllowFusion =
10495 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10496 if (!AllowFusion)
10497 return false;
10498
10499 Opnd2 = Opnd1;
10500 Opnd1 = Opnd0.getOperand(1);
10501 Opnd0 = Opnd0.getOperand(0);
10502
10503 return true;
10504}
10505
10506/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10507/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10508/// X86ISD::FMSUBADD node.
10509static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10510 const X86Subtarget &Subtarget,
10511 SelectionDAG &DAG) {
10512 SDValue Opnd0, Opnd1;
10513 unsigned NumExtracts;
10514 bool IsSubAdd;
10515 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10516 IsSubAdd))
10517 return SDValue();
10518
10519 MVT VT = BV->getSimpleValueType(0);
10520 SDLoc DL(BV);
10521
10522 // Try to generate X86ISD::FMADDSUB node here.
10523 SDValue Opnd2;
10524 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10525 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10526 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10527 }
10528
10529 // We only support ADDSUB.
10530 if (IsSubAdd)
10531 return SDValue();
10532
10533 // There are no known X86 targets with 512-bit ADDSUB instructions!
10534 // Convert to blend(fsub,fadd).
10535 if (VT.is512BitVector()) {
10536 SmallVector<int> Mask;
10537 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10538 Mask.push_back(I);
10539 Mask.push_back(I + E + 1);
10540 }
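    // Illustrative note: for v8f64 the loop above produces
    // Mask = {0, 9, 2, 11, 4, 13, 6, 15}, so even result lanes take the FSUB
    // value and odd lanes take the FADD value, matching ADDSUB semantics.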
10541 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10542 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10543 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10544 }
10545
10546 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10547}
10548
10549static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10550 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10551 // Initialize outputs to known values.
10552 MVT VT = BV->getSimpleValueType(0);
10553 HOpcode = ISD::DELETED_NODE;
10554 V0 = DAG.getUNDEF(VT);
10555 V1 = DAG.getUNDEF(VT);
10556
10557 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10558 // half of the result is calculated independently from the 128-bit halves of
10559 // the inputs, so that makes the index-checking logic below more complicated.
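  // Illustrative sketch: for a v8i32 HADD(A, B), each 128-bit lane is computed
  // independently, giving
  //   { A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7 }.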
10560 unsigned NumElts = VT.getVectorNumElements();
10561 unsigned GenericOpcode = ISD::DELETED_NODE;
10562 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10563 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10564 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10565 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10566 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10567 // Ignore undef elements.
10568 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10569 if (Op.isUndef())
10570 continue;
10571
10572 // If there's an opcode mismatch, we're done.
10573 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10574 return false;
10575
10576 // Initialize horizontal opcode.
10577 if (HOpcode == ISD::DELETED_NODE) {
10578 GenericOpcode = Op.getOpcode();
10579 switch (GenericOpcode) {
10580 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10581 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10582 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10583 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10584 default: return false;
10585 }
10586 }
10587
10588 SDValue Op0 = Op.getOperand(0);
10589 SDValue Op1 = Op.getOperand(1);
10590 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10591 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10592 Op0.getOperand(0) != Op1.getOperand(0) ||
10593 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10594 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10595 return false;
10596
10597 // The source vector is chosen based on which 64-bit half of the
10598 // destination vector is being calculated.
10599 if (j < NumEltsIn64Bits) {
10600 if (V0.isUndef())
10601 V0 = Op0.getOperand(0);
10602 } else {
10603 if (V1.isUndef())
10604 V1 = Op0.getOperand(0);
10605 }
10606
10607 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10608 if (SourceVec != Op0.getOperand(0))
10609 return false;
10610
10611 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10612 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10613 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10614 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10615 (j % NumEltsIn64Bits) * 2;
10616 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10617 continue;
10618
10619 // If this is not a commutative op, this does not match.
10620 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10621 return false;
10622
10623 // Addition is commutative, so try swapping the extract indexes.
10624 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10625 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10626 continue;
10627
10628 // Extract indexes do not match horizontal requirement.
10629 return false;
10630 }
10631 }
10632 // We matched. Opcode and operands are returned by reference as arguments.
10633 return true;
10634}
10635
10636static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10637 SelectionDAG &DAG, unsigned HOpcode,
10638 SDValue V0, SDValue V1) {
10639 // If either input vector is not the same size as the build vector,
10640 // extract/insert the low bits to the correct size.
10641 // This is free (examples: zmm --> xmm, xmm --> ymm).
10642 MVT VT = BV->getSimpleValueType(0);
10643 unsigned Width = VT.getSizeInBits();
10644 if (V0.getValueSizeInBits() > Width)
10645 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10646 else if (V0.getValueSizeInBits() < Width)
10647 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10648
10649 if (V1.getValueSizeInBits() > Width)
10650 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10651 else if (V1.getValueSizeInBits() < Width)
10652 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10653
10654 unsigned NumElts = VT.getVectorNumElements();
10655 APInt DemandedElts = APInt::getAllOnes(NumElts);
10656 for (unsigned i = 0; i != NumElts; ++i)
10657 if (BV->getOperand(i).isUndef())
10658 DemandedElts.clearBit(i);
10659
10660 // If we don't need the upper xmm, then perform as a xmm hop.
10661 unsigned HalfNumElts = NumElts / 2;
10662 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10663 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10664 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10665 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10666 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10667 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10668 }
10669
10670 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10671}
10672
10673/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10674static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10675 const X86Subtarget &Subtarget,
10676 SelectionDAG &DAG) {
10677 // We need at least 2 non-undef elements to make this worthwhile by default.
10678 unsigned NumNonUndefs =
10679 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10680 if (NumNonUndefs < 2)
10681 return SDValue();
10682
10683 // There are 4 sets of horizontal math operations distinguished by type:
10684 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10685 // subtarget feature. Try to match those "native" patterns first.
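  // (Roughly: SSE3 added the 128-bit FP haddps/haddpd, SSSE3 the 128-bit
  // integer phaddw/phaddd, and AVX/AVX2 the 256-bit FP/integer forms.)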
10686 MVT VT = BV->getSimpleValueType(0);
10687 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10688 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10689 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10690 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10691 unsigned HOpcode;
10692 SDValue V0, V1;
10693 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10694 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10695 }
10696
10697 // Try harder to match 256-bit ops by using extract/concat.
10698 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10699 return SDValue();
10700
10701 // Count the number of UNDEF operands in the build_vector in input.
10702 unsigned NumElts = VT.getVectorNumElements();
10703 unsigned Half = NumElts / 2;
10704 unsigned NumUndefsLO = 0;
10705 unsigned NumUndefsHI = 0;
10706 for (unsigned i = 0, e = Half; i != e; ++i)
10707 if (BV->getOperand(i)->isUndef())
10708 NumUndefsLO++;
10709
10710 for (unsigned i = Half, e = NumElts; i != e; ++i)
10711 if (BV->getOperand(i)->isUndef())
10712 NumUndefsHI++;
10713
10714 SDLoc DL(BV);
10715 SDValue InVec0, InVec1;
10716 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10717 SDValue InVec2, InVec3;
10718 unsigned X86Opcode;
10719 bool CanFold = true;
10720
10721 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10722 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10723 InVec3) &&
10724 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10725 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10726 X86Opcode = X86ISD::HADD;
10727 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10728 InVec1) &&
10729 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10730 InVec3) &&
10731 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10732 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10733 X86Opcode = X86ISD::HSUB;
10734 else
10735 CanFold = false;
10736
10737 if (CanFold) {
10738 // Do not try to expand this build_vector into a pair of horizontal
10739 // add/sub if we can emit a pair of scalar add/sub.
10740 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10741 return SDValue();
10742
10743 // Convert this build_vector into a pair of horizontal binops followed by
10744 // a concat vector. We must adjust the outputs from the partial horizontal
10745 // matching calls above to account for undefined vector halves.
10746 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10747 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10748      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10749 bool isUndefLO = NumUndefsLO == Half;
10750 bool isUndefHI = NumUndefsHI == Half;
10751 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10752 isUndefHI);
10753 }
10754 }
10755
10756 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10757 VT == MVT::v16i16) {
10758 unsigned X86Opcode;
10759 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10760 X86Opcode = X86ISD::HADD;
10761 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10762 InVec1))
10763 X86Opcode = X86ISD::HSUB;
10764 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10765 InVec1))
10766 X86Opcode = X86ISD::FHADD;
10767 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10768 InVec1))
10769 X86Opcode = X86ISD::FHSUB;
10770 else
10771 return SDValue();
10772
10773 // Don't try to expand this build_vector into a pair of horizontal add/sub
10774 // if we can simply emit a pair of scalar add/sub.
10775 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10776 return SDValue();
10777
10778 // Convert this build_vector into two horizontal add/sub followed by
10779 // a concat vector.
10780 bool isUndefLO = NumUndefsLO == Half;
10781 bool isUndefHI = NumUndefsHI == Half;
10782 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10783 isUndefLO, isUndefHI);
10784 }
10785
10786 return SDValue();
10787}
10788
10789static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10790 SelectionDAG &DAG);
10791
10792/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10793/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10794/// just apply the bit operation to the vectors.
10795/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10796/// from this, but enough scalar bit operations are created from the later
10797/// legalization + scalarization stages to need basic support.
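/// A minimal sketch of the transform (hypothetical scalars %a..%d):
///   build_vector (and %a, 1), (and %b, 2), (and %c, 4), (and %d, 8)
///     --> and (build_vector %a, %b, %c, %d), (build_vector 1, 2, 4, 8)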
10798static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10799 const X86Subtarget &Subtarget,
10800 SelectionDAG &DAG) {
10801 SDLoc DL(Op);
10802 MVT VT = Op->getSimpleValueType(0);
10803 unsigned NumElems = VT.getVectorNumElements();
10804 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10805
10806 // Check that all elements have the same opcode.
10807 // TODO: Should we allow UNDEFS and if so how many?
10808 unsigned Opcode = Op->getOperand(0).getOpcode();
10809 for (unsigned i = 1; i < NumElems; ++i)
10810 if (Opcode != Op->getOperand(i).getOpcode())
10811 return SDValue();
10812
10813 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10814 bool IsShift = false;
10815 switch (Opcode) {
10816 default:
10817 return SDValue();
10818 case ISD::SHL:
10819 case ISD::SRL:
10820 case ISD::SRA:
10821 IsShift = true;
10822 break;
10823 case ISD::AND:
10824 case ISD::XOR:
10825 case ISD::OR:
10826 // Don't do this if the buildvector is a splat - we'd replace one
10827 // constant with an entire vector.
10828 if (Op->getSplatValue())
10829 return SDValue();
10830 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10831 return SDValue();
10832 break;
10833 }
10834
10835 SmallVector<SDValue, 4> LHSElts, RHSElts;
10836 for (SDValue Elt : Op->ops()) {
10837 SDValue LHS = Elt.getOperand(0);
10838 SDValue RHS = Elt.getOperand(1);
10839
10840 // We expect the canonicalized RHS operand to be the constant.
10841 if (!isa<ConstantSDNode>(RHS))
10842 return SDValue();
10843
10844 // Extend shift amounts.
10845 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10846 if (!IsShift)
10847 return SDValue();
10848 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10849 }
10850
10851 LHSElts.push_back(LHS);
10852 RHSElts.push_back(RHS);
10853 }
10854
10855 // Limit to shifts by uniform immediates.
10856 // TODO: Only accept vXi8/vXi64 special cases?
10857 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10858 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10859 return SDValue();
10860
10861 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10862 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10863 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10864
10865 if (!IsShift)
10866 return Res;
10867
10868 // Immediately lower the shift to ensure the constant build vector doesn't
10869 // get converted to a constant pool before the shift is lowered.
10870 return LowerShift(Res, Subtarget, DAG);
10871}
10872
10873/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10874/// functionality to do this, so it's all zeros, all ones, or some derivation
10875/// that is cheap to calculate.
10876static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10877 const X86Subtarget &Subtarget) {
10878 SDLoc DL(Op);
10879 MVT VT = Op.getSimpleValueType();
10880
10881 // Vectors containing all zeros can be matched by pxor and xorps.
10882 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10883 return Op;
10884
10885 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10886 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10887 // vpcmpeqd on 256-bit vectors.
10888 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10889 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10890 return Op;
10891
10892 return getOnesVector(VT, DAG, DL);
10893 }
10894
10895 return SDValue();
10896}
10897
10898/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10899/// from a vector of source values and a vector of extraction indices.
10900/// The vectors might be manipulated to match the type of the permute op.
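/// For instance (a sketch assuming an AVX1-only subtarget), a v4i32 variable
/// permute is emitted as X86ISD::VPERMILPV on v4f32 with the same indices and
/// the result is bitcast back to v4i32.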
10901static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10902 SDLoc &DL, SelectionDAG &DAG,
10903 const X86Subtarget &Subtarget) {
10904 MVT ShuffleVT = VT;
10905 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10906 unsigned NumElts = VT.getVectorNumElements();
10907 unsigned SizeInBits = VT.getSizeInBits();
10908
10909 // Adjust IndicesVec to match VT size.
10910   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10911          "Illegal variable permute mask size");
10912 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10913 // Narrow/widen the indices vector to the correct size.
10914 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10915 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10916 NumElts * VT.getScalarSizeInBits());
10917 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10918 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10919 SDLoc(IndicesVec), SizeInBits);
10920 // Zero-extend the index elements within the vector.
10921 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10922 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10923 IndicesVT, IndicesVec);
10924 }
10925 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10926
10927   // Handle a SrcVec whose size doesn't match VT.
10928 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10929 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10930 // Handle larger SrcVec by treating it as a larger permute.
10931 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10932 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10933 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10934 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10935 Subtarget, DAG, SDLoc(IndicesVec));
10936 SDValue NewSrcVec =
10937 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10938 if (NewSrcVec)
10939 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10940 return SDValue();
10941 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10942 // Widen smaller SrcVec to match VT.
10943 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10944 } else
10945 return SDValue();
10946 }
10947
10948 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10949     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10950 EVT SrcVT = Idx.getValueType();
10951 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10952 uint64_t IndexScale = 0;
10953 uint64_t IndexOffset = 0;
10954
10955 // If we're scaling a smaller permute op, then we need to repeat the
10956 // indices, scaling and offsetting them as well.
10957 // e.g. v4i32 -> v16i8 (Scale = 4)
10958 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10959 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10960 for (uint64_t i = 0; i != Scale; ++i) {
10961 IndexScale |= Scale << (i * NumDstBits);
10962 IndexOffset |= i << (i * NumDstBits);
10963 }
10964
10965 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10966 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10967 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10968 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10969 return Idx;
10970 };
10971
10972 unsigned Opcode = 0;
10973 switch (VT.SimpleTy) {
10974 default:
10975 break;
10976 case MVT::v16i8:
10977 if (Subtarget.hasSSSE3())
10978 Opcode = X86ISD::PSHUFB;
10979 break;
10980 case MVT::v8i16:
10981 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10982 Opcode = X86ISD::VPERMV;
10983 else if (Subtarget.hasSSSE3()) {
10984 Opcode = X86ISD::PSHUFB;
10985 ShuffleVT = MVT::v16i8;
10986 }
10987 break;
10988 case MVT::v4f32:
10989 case MVT::v4i32:
10990 if (Subtarget.hasAVX()) {
10991 Opcode = X86ISD::VPERMILPV;
10992 ShuffleVT = MVT::v4f32;
10993 } else if (Subtarget.hasSSSE3()) {
10994 Opcode = X86ISD::PSHUFB;
10995 ShuffleVT = MVT::v16i8;
10996 }
10997 break;
10998 case MVT::v2f64:
10999 case MVT::v2i64:
11000 if (Subtarget.hasAVX()) {
11001 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
11002 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11003 Opcode = X86ISD::VPERMILPV;
11004 ShuffleVT = MVT::v2f64;
11005 } else if (Subtarget.hasSSE41()) {
11006 // SSE41 can compare v2i64 - select between indices 0 and 1.
11007 return DAG.getSelectCC(
11008 DL, IndicesVec,
11009 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11010 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11011 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11012 ISD::CondCode::SETEQ);
11013 }
11014 break;
11015 case MVT::v32i8:
11016 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11017 Opcode = X86ISD::VPERMV;
11018 else if (Subtarget.hasXOP()) {
11019 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11020 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11021 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11022 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11023 return DAG.getNode(
11024 ISD::CONCAT_VECTORS, DL, VT,
11025 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11026 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11027 } else if (Subtarget.hasAVX()) {
11028 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11029 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11030 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11031 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11032 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11033 ArrayRef<SDValue> Ops) {
11034 // Permute Lo and Hi and then select based on index range.
11035           // This works as PSHUFB uses bits[3:0] to permute elements and we don't
11036           // care about bit[7] as it's just an index vector.
11037 SDValue Idx = Ops[2];
11038 EVT VT = Idx.getValueType();
11039 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11040 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11041 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11042 ISD::CondCode::SETGT);
11043 };
11044 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11045 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11046 PSHUFBBuilder);
11047 }
11048 break;
11049 case MVT::v16i16:
11050 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11051 Opcode = X86ISD::VPERMV;
11052 else if (Subtarget.hasAVX()) {
11053 // Scale to v32i8 and perform as v32i8.
11054 IndicesVec = ScaleIndices(IndicesVec, 2);
11055 return DAG.getBitcast(
11056 VT, createVariablePermute(
11057 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11058 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11059 }
11060 break;
11061 case MVT::v8f32:
11062 case MVT::v8i32:
11063 if (Subtarget.hasAVX2())
11064 Opcode = X86ISD::VPERMV;
11065 else if (Subtarget.hasAVX()) {
11066 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11067 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11068 {0, 1, 2, 3, 0, 1, 2, 3});
11069 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11070 {4, 5, 6, 7, 4, 5, 6, 7});
11071 if (Subtarget.hasXOP())
11072 return DAG.getBitcast(
11073 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11074 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11075 // Permute Lo and Hi and then select based on index range.
11076 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11077 SDValue Res = DAG.getSelectCC(
11078 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11079 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11080 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11081 ISD::CondCode::SETGT);
11082 return DAG.getBitcast(VT, Res);
11083 }
11084 break;
11085 case MVT::v4i64:
11086 case MVT::v4f64:
11087 if (Subtarget.hasAVX512()) {
11088 if (!Subtarget.hasVLX()) {
11089 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11090 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11091 SDLoc(SrcVec));
11092 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11093 DAG, SDLoc(IndicesVec));
11094 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11095 DAG, Subtarget);
11096 return extract256BitVector(Res, 0, DAG, DL);
11097 }
11098 Opcode = X86ISD::VPERMV;
11099 } else if (Subtarget.hasAVX()) {
11100 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11101 SDValue LoLo =
11102 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11103 SDValue HiHi =
11104 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11105 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11106 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11107 if (Subtarget.hasXOP())
11108 return DAG.getBitcast(
11109 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11110 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11111 // Permute Lo and Hi and then select based on index range.
11112 // This works as VPERMILPD only uses index bit[1] to permute elements.
11113 SDValue Res = DAG.getSelectCC(
11114 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11115 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11116 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11117 ISD::CondCode::SETGT);
11118 return DAG.getBitcast(VT, Res);
11119 }
11120 break;
11121 case MVT::v64i8:
11122 if (Subtarget.hasVBMI())
11123 Opcode = X86ISD::VPERMV;
11124 break;
11125 case MVT::v32i16:
11126 if (Subtarget.hasBWI())
11127 Opcode = X86ISD::VPERMV;
11128 break;
11129 case MVT::v16f32:
11130 case MVT::v16i32:
11131 case MVT::v8f64:
11132 case MVT::v8i64:
11133 if (Subtarget.hasAVX512())
11134 Opcode = X86ISD::VPERMV;
11135 break;
11136 }
11137 if (!Opcode)
11138 return SDValue();
11139
11140   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11141          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11142          "Illegal variable permute shuffle type");
11143
11144 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11145 if (Scale > 1)
11146 IndicesVec = ScaleIndices(IndicesVec, Scale);
11147
11148 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11149 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11150
11151 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11152 SDValue Res = Opcode == X86ISD::VPERMV
11153 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11154 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11155 return DAG.getBitcast(VT, Res);
11156}
11157
11158// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11159// reasoned to be a permutation of a vector by indices in a non-constant vector.
11160// (build_vector (extract_elt V, (extract_elt I, 0)),
11161// (extract_elt V, (extract_elt I, 1)),
11162// ...
11163// ->
11164// (vpermv I, V)
11165//
11166// TODO: Handle undefs
11167// TODO: Utilize pshufb and zero mask blending to support more efficient
11168// construction of vectors with constant-0 elements.
11169static SDValue
11170LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11171 const X86Subtarget &Subtarget) {
11172 SDValue SrcVec, IndicesVec;
11173 // Check for a match of the permute source vector and permute index elements.
11174 // This is done by checking that the i-th build_vector operand is of the form:
11175 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11176 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11177 SDValue Op = V.getOperand(Idx);
11178 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11179 return SDValue();
11180
11181 // If this is the first extract encountered in V, set the source vector,
11182 // otherwise verify the extract is from the previously defined source
11183 // vector.
11184 if (!SrcVec)
11185 SrcVec = Op.getOperand(0);
11186 else if (SrcVec != Op.getOperand(0))
11187 return SDValue();
11188 SDValue ExtractedIndex = Op->getOperand(1);
11189 // Peek through extends.
11190 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11191 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11192 ExtractedIndex = ExtractedIndex.getOperand(0);
11193 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11194 return SDValue();
11195
11196 // If this is the first extract from the index vector candidate, set the
11197 // indices vector, otherwise verify the extract is from the previously
11198 // defined indices vector.
11199 if (!IndicesVec)
11200 IndicesVec = ExtractedIndex.getOperand(0);
11201 else if (IndicesVec != ExtractedIndex.getOperand(0))
11202 return SDValue();
11203
11204 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11205 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11206 return SDValue();
11207 }
11208
11209 SDLoc DL(V);
11210 MVT VT = V.getSimpleValueType();
11211 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11212}
11213
11214SDValue
11215X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11216 SDLoc dl(Op);
11217
11218 MVT VT = Op.getSimpleValueType();
11219 MVT EltVT = VT.getVectorElementType();
11220 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11221 unsigned NumElems = Op.getNumOperands();
11222
11223 // Generate vectors for predicate vectors.
11224 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11225 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11226
11227 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11228 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11229
11230 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11231 return VectorConstant;
11232
11233 unsigned EVTBits = EltVT.getSizeInBits();
11234 APInt UndefMask = APInt::getZero(NumElems);
11235 APInt FrozenUndefMask = APInt::getZero(NumElems);
11236 APInt ZeroMask = APInt::getZero(NumElems);
11237 APInt NonZeroMask = APInt::getZero(NumElems);
11238 bool IsAllConstants = true;
11239 SmallSet<SDValue, 8> Values;
11240 unsigned NumConstants = NumElems;
11241 for (unsigned i = 0; i < NumElems; ++i) {
11242 SDValue Elt = Op.getOperand(i);
11243 if (Elt.isUndef()) {
11244 UndefMask.setBit(i);
11245 continue;
11246 }
11247 if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
11248 FrozenUndefMask.setBit(i);
11249 continue;
11250 }
11251 Values.insert(Elt);
11252 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
11253 IsAllConstants = false;
11254 NumConstants--;
11255 }
11256 if (X86::isZeroNode(Elt)) {
11257 ZeroMask.setBit(i);
11258 } else {
11259 NonZeroMask.setBit(i);
11260 }
11261 }
11262
11263 // All undef vector. Return an UNDEF.
11264 if (UndefMask.isAllOnes())
11265 return DAG.getUNDEF(VT);
11266
11267 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11268 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11269 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11270 // and blend the FREEZE-UNDEF operands back in.
11271 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
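  // Sketch: for <x, freeze(undef), y, freeze(undef)> this builds
  // <x, undef, y, undef>, a splat of a single freeze(undef), and blends them
  // with shuffle mask <0, 5, 2, 7>.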
11272 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11273 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11274 SmallVector<int, 16> BlendMask(NumElems, -1);
11275 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11276 for (unsigned i = 0; i < NumElems; ++i) {
11277 if (UndefMask[i]) {
11278 BlendMask[i] = -1;
11279 continue;
11280 }
11281 BlendMask[i] = i;
11282 if (!FrozenUndefMask[i])
11283 Elts[i] = Op.getOperand(i);
11284 else
11285 BlendMask[i] += NumElems;
11286 }
11287 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11288 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11289 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11290 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11291 }
11292
11293 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11294
11295 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
11296 // lowering to a smaller build vector and padding with undef/zero.
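  // Sketch: a v8i32 build_vector whose upper four operands are all undef/zero
  // can be built as a v4i32 build_vector and widened back to v8i32 (padding
  // with zeros unless the upper half is entirely undef).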
11297 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11298 !isFoldableUseOfShuffle(BV)) {
11299 unsigned UpperElems = NumElems / 2;
11300 APInt UndefOrZeroMask = UndefMask | ZeroMask;
11301 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11302 if (NumUpperUndefsOrZeros >= UpperElems) {
11303 if (VT.is512BitVector() &&
11304 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11305 UpperElems = NumElems - (NumElems / 4);
11306 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11307 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11308 SDValue NewBV =
11309 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11310 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11311 }
11312 }
11313
11314 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11315 return AddSub;
11316 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11317 return HorizontalOp;
11318 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11319 return Broadcast;
11320 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11321 return BitOp;
11322
11323 unsigned NumZero = ZeroMask.popcount();
11324 unsigned NumNonZero = NonZeroMask.popcount();
11325
11326 // If we are inserting one variable into a vector of non-zero constants, try
11327 // to avoid loading each constant element as a scalar. Load the constants as a
11328 // vector and then insert the variable scalar element. If insertion is not
11329 // supported, fall back to a shuffle to get the scalar blended with the
11330 // constants. Insertion into a zero vector is handled as a special-case
11331 // somewhere below here.
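  // Sketch (hypothetical variable %x): <i32 1, i32 2, i32 %x, i32 4> becomes a
  // constant-pool load of <1, 2, undef, 4> followed by inserting %x at index 2
  // (or a shuffle when the insertion index is above the low 128 bits).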
11332 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11333 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11334 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11335 // Create an all-constant vector. The variable element in the old
11336 // build vector is replaced by undef in the constant vector. Save the
11337 // variable scalar element and its index for use in the insertelement.
11338 LLVMContext &Context = *DAG.getContext();
11339 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11340 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11341 SDValue VarElt;
11342 SDValue InsIndex;
11343 for (unsigned i = 0; i != NumElems; ++i) {
11344 SDValue Elt = Op.getOperand(i);
11345 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11346 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11347 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11348 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11349 else if (!Elt.isUndef()) {
11350       assert(!VarElt.getNode() && !InsIndex.getNode() &&
11351              "Expected one variable element in this vector");
11352 VarElt = Elt;
11353 InsIndex = DAG.getVectorIdxConstant(i, dl);
11354 }
11355 }
11356 Constant *CV = ConstantVector::get(ConstVecOps);
11357 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11358
11359 // The constants we just created may not be legal (eg, floating point). We
11360 // must lower the vector right here because we can not guarantee that we'll
11361 // legalize it before loading it. This is also why we could not just create
11362 // a new build vector here. If the build vector contains illegal constants,
11363 // it could get split back up into a series of insert elements.
11364 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11365 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11366 MachineFunction &MF = DAG.getMachineFunction();
11367 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11368 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11369 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11370 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11371 if (InsertC < NumEltsInLow128Bits)
11372 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11373
11374 // There's no good way to insert into the high elements of a >128-bit
11375 // vector, so use shuffles to avoid an extract/insert sequence.
11376     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11377     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11378 SmallVector<int, 8> ShuffleMask;
11379 unsigned NumElts = VT.getVectorNumElements();
11380 for (unsigned i = 0; i != NumElts; ++i)
11381 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11382 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11383 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11384 }
11385
11386 // Special case for single non-zero, non-undef, element.
11387 if (NumNonZero == 1) {
11388 unsigned Idx = NonZeroMask.countr_zero();
11389 SDValue Item = Op.getOperand(Idx);
11390
11391 // If we have a constant or non-constant insertion into the low element of
11392 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11393 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11394 // depending on what the source datatype is.
11395 if (Idx == 0) {
11396 if (NumZero == 0)
11397 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11398
11399 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11400 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11401 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11402         assert((VT.is128BitVector() || VT.is256BitVector() ||
11403                 VT.is512BitVector()) &&
11404                "Expected an SSE value type!");
11405 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11406 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11407 // zero vector.
11408 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11409 }
11410
11411 // We can't directly insert an i8 or i16 into a vector, so zero extend
11412 // it to i32 first.
11413 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11414 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11415 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11416 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11417 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11418 return DAG.getBitcast(VT, Item);
11419 }
11420 }
11421
11422 // Is it a vector logical left shift?
11423 if (NumElems == 2 && Idx == 1 &&
11424 X86::isZeroNode(Op.getOperand(0)) &&
11425 !X86::isZeroNode(Op.getOperand(1))) {
11426 unsigned NumBits = VT.getSizeInBits();
11427 return getVShift(true, VT,
11428 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11429 VT, Op.getOperand(1)),
11430 NumBits/2, DAG, *this, dl);
11431 }
11432
11433 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11434 return SDValue();
11435
11436 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11437 // is a non-constant being inserted into an element other than the low one,
11438 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11439 // movd/movss) to move this into the low element, then shuffle it into
11440 // place.
11441 if (EVTBits == 32) {
11442 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11443 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11444 }
11445 }
11446
11447 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11448 if (Values.size() == 1) {
11449 if (EVTBits == 32) {
11450 // Instead of a shuffle like this:
11451 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11452 // Check if it's possible to issue this instead.
11453       // shuffle (vload ptr), undef, <1, 1, 1, 1>
11454 unsigned Idx = NonZeroMask.countr_zero();
11455 SDValue Item = Op.getOperand(Idx);
11456 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11457 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11458 }
11459 return SDValue();
11460 }
11461
11462 // A vector full of immediates; various special cases are already
11463 // handled, so this is best done with a single constant-pool load.
11464 if (IsAllConstants)
11465 return SDValue();
11466
11467 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11468 return V;
11469
11470 // See if we can use a vector load to get all of the elements.
11471 {
11472 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11473 if (SDValue LD =
11474 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11475 return LD;
11476 }
11477
11478 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11479 // build_vector and broadcast it.
11480 // TODO: We could probably generalize this more.
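  // Sketch: v8f32 <a, b, a, b, a, b, a, b> is built as v4f32 <a, b, undef,
  // undef>, bitcast to v2f64, broadcast to v4f64, and bitcast back to v8f32.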
11481 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11482 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11483 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11484 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11485 // Make sure all the even/odd operands match.
11486 for (unsigned i = 2; i != NumElems; ++i)
11487 if (Ops[i % 2] != Op.getOperand(i))
11488 return false;
11489 return true;
11490 };
11491 if (CanSplat(Op, NumElems, Ops)) {
11492 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11493 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11494 // Create a new build vector and cast to v2i64/v2f64.
11495 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11496 DAG.getBuildVector(NarrowVT, dl, Ops));
11497 // Broadcast from v2i64/v2f64 and cast to final VT.
11498 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11499 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11500 NewBV));
11501 }
11502 }
11503
11504 // For AVX-length vectors, build the individual 128-bit pieces and use
11505 // shuffles to put them in place.
11506 if (VT.getSizeInBits() > 128) {
11507 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11508
11509 // Build both the lower and upper subvector.
11510 SDValue Lower =
11511 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11512 SDValue Upper = DAG.getBuildVector(
11513 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11514
11515 // Recreate the wider vector with the lower and upper part.
11516 return concatSubVectors(Lower, Upper, DAG, dl);
11517 }
11518
11519 // Let legalizer expand 2-wide build_vectors.
11520 if (EVTBits == 64) {
11521 if (NumNonZero == 1) {
11522 // One half is zero or undef.
11523 unsigned Idx = NonZeroMask.countr_zero();
11524 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11525 Op.getOperand(Idx));
11526 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11527 }
11528 return SDValue();
11529 }
11530
11531 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11532 if (EVTBits == 8 && NumElems == 16)
11533 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11534 DAG, Subtarget))
11535 return V;
11536
11537 if (EltVT == MVT::i16 && NumElems == 8)
11538 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11539 DAG, Subtarget))
11540 return V;
11541
11542 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11543 if (EVTBits == 32 && NumElems == 4)
11544 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11545 return V;
11546
11547 // If element VT is == 32 bits, turn it into a number of shuffles.
11548 if (NumElems == 4 && NumZero > 0) {
11549 SmallVector<SDValue, 8> Ops(NumElems);
11550 for (unsigned i = 0; i < 4; ++i) {
11551 bool isZero = !NonZeroMask[i];
11552 if (isZero)
11553 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11554 else
11555 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11556 }
11557
11558 for (unsigned i = 0; i < 2; ++i) {
11559 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11560       default: llvm_unreachable("Unexpected NonZero count");
11561 case 0:
11562 Ops[i] = Ops[i*2]; // Must be a zero vector.
11563 break;
11564 case 1:
11565 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11566 break;
11567 case 2:
11568 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11569 break;
11570 case 3:
11571 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11572 break;
11573 }
11574 }
11575
11576 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11577 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11578 int MaskVec[] = {
11579 Reverse1 ? 1 : 0,
11580 Reverse1 ? 0 : 1,
11581 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11582 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11583 };
11584 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11585 }
11586
11587   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11588
11589 // Check for a build vector from mostly shuffle plus few inserting.
11590 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11591 return Sh;
11592
11593 // For SSE 4.1, use insertps to put the high elements into the low element.
11594 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11595 SDValue Result;
11596 if (!Op.getOperand(0).isUndef())
11597 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11598 else
11599 Result = DAG.getUNDEF(VT);
11600
11601 for (unsigned i = 1; i < NumElems; ++i) {
11602 if (Op.getOperand(i).isUndef()) continue;
11603 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11604 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11605 }
11606 return Result;
11607 }
11608
11609 // Otherwise, expand into a number of unpckl*, start by extending each of
11610 // our (non-undef) elements to the full vector width with the element in the
11611 // bottom slot of the vector (which generates no code for SSE).
11612 SmallVector<SDValue, 8> Ops(NumElems);
11613 for (unsigned i = 0; i < NumElems; ++i) {
11614 if (!Op.getOperand(i).isUndef())
11615 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11616 else
11617 Ops[i] = DAG.getUNDEF(VT);
11618 }
11619
11620 // Next, we iteratively mix elements, e.g. for v4f32:
11621 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11622 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11623 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11624 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11625 // Generate scaled UNPCKL shuffle mask.
11626 SmallVector<int, 16> Mask;
11627 for(unsigned i = 0; i != Scale; ++i)
11628 Mask.push_back(i);
11629 for (unsigned i = 0; i != Scale; ++i)
11630 Mask.push_back(NumElems+i);
11631 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11632
11633 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11634 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11635 }
11636 return Ops[0];
11637}
11638
11639// 256-bit AVX can use the vinsertf128 instruction
11640// to create 256-bit vectors from two other 128-bit ones.
11641// TODO: Detect subvector broadcast here instead of DAG combine?
11642static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11643 const X86Subtarget &Subtarget) {
11644 SDLoc dl(Op);
11645 MVT ResVT = Op.getSimpleValueType();
11646
11647   assert((ResVT.is256BitVector() ||
11648           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11649
11650 unsigned NumOperands = Op.getNumOperands();
11651 unsigned NumFreezeUndef = 0;
11652 unsigned NumZero = 0;
11653 unsigned NumNonZero = 0;
11654 unsigned NonZeros = 0;
11655 for (unsigned i = 0; i != NumOperands; ++i) {
11656 SDValue SubVec = Op.getOperand(i);
11657 if (SubVec.isUndef())
11658 continue;
11659 if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse())
11660 ++NumFreezeUndef;
11661 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11662 ++NumZero;
11663 else {
11664 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11665 NonZeros |= 1 << i;
11666 ++NumNonZero;
11667 }
11668 }
11669
11670 // If we have more than 2 non-zeros, build each half separately.
11671 if (NumNonZero > 2) {
11672 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11673 ArrayRef<SDUse> Ops = Op->ops();
11674 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11675 Ops.slice(0, NumOperands/2));
11676 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11677 Ops.slice(NumOperands/2));
11678 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11679 }
11680
11681 // Otherwise, build it up through insert_subvectors.
11682 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11683 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11684 : DAG.getUNDEF(ResVT));
11685
11686 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11687 unsigned NumSubElems = SubVT.getVectorNumElements();
11688 for (unsigned i = 0; i != NumOperands; ++i) {
11689 if ((NonZeros & (1 << i)) == 0)
11690 continue;
11691
11692 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11693 Op.getOperand(i),
11694 DAG.getIntPtrConstant(i * NumSubElems, dl));
11695 }
11696
11697 return Vec;
11698}
11699
11700// Lowers a vXi1 CONCAT_VECTORS node. This is typically a type promotion (by
11701// concatenating i1 zeros) of the result of a node that already zeros all
11702// upper bits of a k-register.
11703// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11704static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11705 const X86Subtarget &Subtarget,
11706 SelectionDAG & DAG) {
11707 SDLoc dl(Op);
11708 MVT ResVT = Op.getSimpleValueType();
11709 unsigned NumOperands = Op.getNumOperands();
11710
11711 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11712 "Unexpected number of operands in CONCAT_VECTORS");
11713
11714 uint64_t Zeros = 0;
11715 uint64_t NonZeros = 0;
11716 for (unsigned i = 0; i != NumOperands; ++i) {
11717 SDValue SubVec = Op.getOperand(i);
11718 if (SubVec.isUndef())
11719 continue;
11720 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11721 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11722 Zeros |= (uint64_t)1 << i;
11723 else
11724 NonZeros |= (uint64_t)1 << i;
11725 }
11726
11727 unsigned NumElems = ResVT.getVectorNumElements();
11728
11729 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11730 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11731 // insert_subvector will give us two kshifts.
11732 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11733 Log2_64(NonZeros) != NumOperands - 1) {
11734 MVT ShiftVT = ResVT;
11735 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11736 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11737 unsigned Idx = Log2_64(NonZeros);
11738 SDValue SubVec = Op.getOperand(Idx);
11739 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11740 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11741 DAG.getUNDEF(ShiftVT), SubVec,
11742 DAG.getIntPtrConstant(0, dl));
11743 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11744 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11745 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11746 DAG.getIntPtrConstant(0, dl));
11747 }
11748
11749 // If there are zero or one non-zeros we can handle this very simply.
11750 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11751 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11752 if (!NonZeros)
11753 return Vec;
11754 unsigned Idx = Log2_64(NonZeros);
11755 SDValue SubVec = Op.getOperand(Idx);
11756 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11757 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11758 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11759 }
11760
11761 if (NumOperands > 2) {
11762 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11763 ArrayRef<SDUse> Ops = Op->ops();
11764 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11765 Ops.slice(0, NumOperands/2));
11766 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11767 Ops.slice(NumOperands/2));
11768 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11769 }
11770
11771 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11772
11773 if (ResVT.getVectorNumElements() >= 16)
11774 return Op; // The operation is legal with KUNPCK
11775
11776 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11777 DAG.getUNDEF(ResVT), Op.getOperand(0),
11778 DAG.getIntPtrConstant(0, dl));
11779 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11780 DAG.getIntPtrConstant(NumElems/2, dl));
11781}
11782
11783static SDValue LowerCONCAT_VECTORS(SDValue Op,
11784 const X86Subtarget &Subtarget,
11785 SelectionDAG &DAG) {
11786 MVT VT = Op.getSimpleValueType();
11787 if (VT.getVectorElementType() == MVT::i1)
11788 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11789
11790 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11791 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11792 Op.getNumOperands() == 4)));
11793
11794 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11795 // from two other 128-bit ones.
11796
11797 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11798 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11799}
11800
11801//===----------------------------------------------------------------------===//
11802// Vector shuffle lowering
11803//
11804// This is an experimental code path for lowering vector shuffles on x86. It is
11805// designed to handle arbitrary vector shuffles and blends, gracefully
11806// degrading performance as necessary. It works hard to recognize idiomatic
11807// shuffles and lower them to optimal instruction patterns without leaving
11808// a framework that allows reasonably efficient handling of all vector shuffle
11809// patterns.
11810//===----------------------------------------------------------------------===//
11811
11812/// Tiny helper function to identify a no-op mask.
11813///
11814/// This is a somewhat boring predicate function. It checks whether the mask
11815/// array input, which is assumed to be a single-input shuffle mask of the kind
11816/// used by the X86 shuffle instructions (not a fully general
11817/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
11818/// in-place shuffle are 'no-op's.
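/// For example, <0, -1, 2, 3> is a no-op mask (the undef element is ignored),
/// while <0, 0, 2, 3> is not, since element 1 would move.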
11819static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11820 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11821 assert(Mask[i] >= -1 && "Out of bound mask element!");
11822 if (Mask[i] >= 0 && Mask[i] != i)
11823 return false;
11824 }
11825 return true;
11826}
11827
11828/// Test whether there are elements crossing LaneSizeInBits lanes in this
11829/// shuffle mask.
11830///
11831/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11832/// and we routinely test for these.
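/// For example, for v8f32 (four 32-bit elements per 128-bit lane) a mask that
/// places element 2 into position 5 crosses lanes, since 2 / 4 != 5 / 4.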
11833static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11834 unsigned ScalarSizeInBits,
11835 ArrayRef<int> Mask) {
11836 assert(LaneSizeInBits && ScalarSizeInBits &&
11837 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11838 "Illegal shuffle lane size");
11839 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11840 int Size = Mask.size();
11841 for (int i = 0; i < Size; ++i)
11842 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11843 return true;
11844 return false;
11845}
11846
11847/// Test whether there are elements crossing 128-bit lanes in this
11848/// shuffle mask.
11849static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11850 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11851}
11852
11853/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11854/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11855/// better support 'repeated mask + lane permute' style shuffles.
11856static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11857 unsigned ScalarSizeInBits,
11858 ArrayRef<int> Mask) {
11859 assert(LaneSizeInBits && ScalarSizeInBits &&
11860 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11861 "Illegal shuffle lane size");
11862 int NumElts = Mask.size();
11863 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11864 int NumLanes = NumElts / NumEltsPerLane;
11865 if (NumLanes > 1) {
11866 for (int i = 0; i != NumLanes; ++i) {
11867 int SrcLane = -1;
11868 for (int j = 0; j != NumEltsPerLane; ++j) {
11869 int M = Mask[(i * NumEltsPerLane) + j];
11870 if (M < 0)
11871 continue;
11872 int Lane = (M % NumElts) / NumEltsPerLane;
11873 if (SrcLane >= 0 && SrcLane != Lane)
11874 return true;
11875 SrcLane = Lane;
11876 }
11877 }
11878 }
11879 return false;
11880}
11881
11882/// Test whether a shuffle mask is equivalent within each sub-lane.
11883///
11884/// This checks a shuffle mask to see if it is performing the same
11885/// lane-relative shuffle in each sub-lane. This trivially implies
11886/// that it is also not lane-crossing. It may however involve a blend from the
11887/// same lane of a second vector.
11888///
11889/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11890/// non-trivial to compute in the face of undef lanes. The representation is
11891/// suitable for use with existing 128-bit shuffles as entries from the second
11892/// vector have been remapped to [LaneSize, 2*LaneSize).
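/// For example, the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats within each
/// 128-bit lane with RepeatedMask <0, 5, 2, 7> (second-vector indices remapped
/// into [4, 8)).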
11893static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11894 ArrayRef<int> Mask,
11895 SmallVectorImpl<int> &RepeatedMask) {
11896 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11897 RepeatedMask.assign(LaneSize, -1);
11898 int Size = Mask.size();
11899 for (int i = 0; i < Size; ++i) {
11900 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11901 if (Mask[i] < 0)
11902 continue;
11903 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11904 // This entry crosses lanes, so there is no way to model this shuffle.
11905 return false;
11906
11907 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11908 // Adjust second vector indices to start at LaneSize instead of Size.
11909 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11910 : Mask[i] % LaneSize + LaneSize;
11911 if (RepeatedMask[i % LaneSize] < 0)
11912 // This is the first non-undef entry in this slot of a 128-bit lane.
11913 RepeatedMask[i % LaneSize] = LocalM;
11914 else if (RepeatedMask[i % LaneSize] != LocalM)
11915 // Found a mismatch with the repeated mask.
11916 return false;
11917 }
11918 return true;
11919}
11920
11921/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11922static bool
11923is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11924 SmallVectorImpl<int> &RepeatedMask) {
11925 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11926}
11927
11928static bool
11929is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11930 SmallVector<int, 32> RepeatedMask;
11931 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11932}
11933
11934/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11935static bool
11936is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11937 SmallVectorImpl<int> &RepeatedMask) {
11938 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11939}
11940
11941/// Test whether a target shuffle mask is equivalent within each sub-lane.
11942/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11943static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11944 unsigned EltSizeInBits,
11945 ArrayRef<int> Mask,
11946 SmallVectorImpl<int> &RepeatedMask) {
11947 int LaneSize = LaneSizeInBits / EltSizeInBits;
11948 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11949 int Size = Mask.size();
11950 for (int i = 0; i < Size; ++i) {
11951 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11952 if (Mask[i] == SM_SentinelUndef)
11953 continue;
11954 if (Mask[i] == SM_SentinelZero) {
11955 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11956 return false;
11957 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11958 continue;
11959 }
11960 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11961 // This entry crosses lanes, so there is no way to model this shuffle.
11962 return false;
11963
11964 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11965 // later vector indices to start at multiples of LaneSize instead of Size.
11966 int LaneM = Mask[i] / Size;
11967 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11968 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11969 // This is the first non-undef entry in this slot of a 128-bit lane.
11970 RepeatedMask[i % LaneSize] = LocalM;
11971 else if (RepeatedMask[i % LaneSize] != LocalM)
11972 // Found a mismatch with the repeated mask.
11973 return false;
11974 }
11975 return true;
11976}
11977
11978/// Test whether a target shuffle mask is equivalent within each sub-lane.
11979/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11980static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11981 ArrayRef<int> Mask,
11982 SmallVectorImpl<int> &RepeatedMask) {
11983 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11984 Mask, RepeatedMask);
11985}
11986
11987/// Checks whether the vector elements referenced by two shuffle masks are
11988/// equivalent.
11989static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11990 int Idx, int ExpectedIdx) {
11991 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11992 ExpectedIdx < MaskSize && "Out of range element index");
11993 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11994 return false;
11995
11996 switch (Op.getOpcode()) {
11997 case ISD::BUILD_VECTOR:
11998 // If the values are build vectors, we can look through them to find
11999 // equivalent inputs that make the shuffles equivalent.
12000 // TODO: Handle MaskSize != Op.getNumOperands()?
12001 if (MaskSize == (int)Op.getNumOperands() &&
12002 MaskSize == (int)ExpectedOp.getNumOperands())
12003 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12004 break;
12005 case X86ISD::VBROADCAST:
12006 case X86ISD::VBROADCAST_LOAD:
12007 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12008 return (Op == ExpectedOp &&
12009 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12010 case X86ISD::HADD:
12011 case X86ISD::HSUB:
12012 case X86ISD::FHADD:
12013 case X86ISD::FHSUB:
12014 case X86ISD::PACKSS:
12015 case X86ISD::PACKUS:
12016 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12017 // TODO: Handle MaskSize != NumElts?
12018 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12019 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12020 MVT VT = Op.getSimpleValueType();
12021 int NumElts = VT.getVectorNumElements();
12022 if (MaskSize == NumElts) {
12023 int NumLanes = VT.getSizeInBits() / 128;
12024 int NumEltsPerLane = NumElts / NumLanes;
12025 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12026 bool SameLane =
12027 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12028 bool SameElt =
12029 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12030 return SameLane && SameElt;
12031 }
12032 }
12033 break;
12034 }
12035
12036 return false;
12037}
12038
12039/// Checks whether a shuffle mask is equivalent to an explicit list of
12040/// arguments.
12041///
12042/// This is a fast way to test a shuffle mask against a fixed pattern:
12043///
12044/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12045///
12046/// It returns true if the mask is exactly as wide as the argument list, and
12047/// each element of the mask is either -1 (signifying undef) or the value given
12048/// in the argument.
12049static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12050 SDValue V1 = SDValue(),
12051 SDValue V2 = SDValue()) {
12052 int Size = Mask.size();
12053 if (Size != (int)ExpectedMask.size())
12054 return false;
12055
12056 for (int i = 0; i < Size; ++i) {
12057 assert(Mask[i] >= -1 && "Out of bound mask element!");
12058 int MaskIdx = Mask[i];
12059 int ExpectedIdx = ExpectedMask[i];
12060 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12061 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12062 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12063 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12064 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12065 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12066 return false;
12067 }
12068 }
12069 return true;
12070}
12071
12072/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12073///
12074/// The masks must be exactly the same width.
12075///
12076/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12077/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12078///
12079/// SM_SentinelZero is accepted as a valid negative index but must match in
12080/// both, or via a known bits test.
12081static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12082 ArrayRef<int> ExpectedMask,
12083 const SelectionDAG &DAG,
12084 SDValue V1 = SDValue(),
12085 SDValue V2 = SDValue()) {
12086 int Size = Mask.size();
12087 if (Size != (int)ExpectedMask.size())
12088 return false;
12089 assert(llvm::all_of(ExpectedMask,
12090 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12091 "Illegal target shuffle mask");
12092
12093 // Check for out-of-range target shuffle mask indices.
12094 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12095 return false;
12096
12097 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12098 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12099 V1 = SDValue();
12100 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12101 V2 = SDValue();
12102
12103 APInt ZeroV1 = APInt::getZero(Size);
12104 APInt ZeroV2 = APInt::getZero(Size);
12105
12106 for (int i = 0; i < Size; ++i) {
12107 int MaskIdx = Mask[i];
12108 int ExpectedIdx = ExpectedMask[i];
12109 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12110 continue;
12111 if (MaskIdx == SM_SentinelZero) {
12112 // If we need this expected index to be a zero element, then update the
12113 // relevant zero mask and perform the known bits at the end to minimize
12114 // repeated computes.
12115 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12116 if (ExpectedV &&
12117 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12118 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12119 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12120 ZeroMask.setBit(BitIdx);
12121 continue;
12122 }
12123 }
12124 if (MaskIdx >= 0) {
12125 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12126 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12127 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12128 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12129 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12130 continue;
12131 }
12132 return false;
12133 }
12134 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12135 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12136}
12137
12138// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12139// instructions.
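// For reference, the binary v8i16 unpack masks generated below are
// <0, 8, 1, 9, 2, 10, 3, 11> (lo) and <4, 12, 5, 13, 6, 14, 7, 15> (hi).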
12140static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12141 const SelectionDAG &DAG) {
12142 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12143 return false;
12144
12145 SmallVector<int, 8> Unpcklwd;
12146 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12147 /* Unary = */ false);
12148 SmallVector<int, 8> Unpckhwd;
12149 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12150 /* Unary = */ false);
12151 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12152 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12153 return IsUnpackwdMask;
12154}
12155
12156static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12157 const SelectionDAG &DAG) {
12158 // Create 128-bit vector type based on mask size.
12159 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12160 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12161
12162 // We can't assume a canonical shuffle mask, so try the commuted version too.
12163 SmallVector<int, 4> CommutedMask(Mask);
12164 ShuffleVectorSDNode::commuteMask(CommutedMask);
12165
12166 // Match any of unary/binary or low/high.
12167 for (unsigned i = 0; i != 4; ++i) {
12168 SmallVector<int, 16> UnpackMask;
12169 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12170 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12171 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12172 return true;
12173 }
12174 return false;
12175}
12176
12177/// Return true if a shuffle mask chooses elements identically in its top and
12178/// bottom halves. For example, any splat mask has the same top and bottom
12179/// halves. If an element is undefined in only one half of the mask, the halves
12180/// are not considered identical.
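/// For example, <0, 1, 0, 1> has identical halves, while <0, 1, 0, -1> does
/// not, because the undef appears in only one half.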
12181static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12182 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12183 unsigned HalfSize = Mask.size() / 2;
12184 for (unsigned i = 0; i != HalfSize; ++i) {
12185 if (Mask[i] != Mask[i + HalfSize])
12186 return false;
12187 }
12188 return true;
12189}
12190
12191/// Get a 4-lane 8-bit shuffle immediate for a mask.
12192///
12193/// This helper function produces an 8-bit shuffle immediate corresponding to
12194/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12195/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12196/// example.
12197///
12198/// NB: We rely heavily on "undef" masks preserving the input lane.
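/// For example, Mask = <1, 0, 3, 2> encodes as
/// (2 << 6) | (3 << 4) | (0 << 2) | 1 == 0xB1, and the single-element mask
/// <-1, 2, -1, -1> is splatted to (2 << 6) | (2 << 4) | (2 << 2) | 2 == 0xAA.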
12199static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12200 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12201 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12202 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12203 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12204 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12205
12206 // If the mask only uses one non-undef element, then fully 'splat' it to
12207 // improve later broadcast matching.
12208 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12209 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12210
12211 int FirstElt = Mask[FirstIndex];
12212 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12213 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12214
12215 unsigned Imm = 0;
12216 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12217 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12218 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12219 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12220 return Imm;
12221}
12222
12223static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12224 SelectionDAG &DAG) {
12225 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12226}
12227
12228// The shuffle result is as follows:
12229// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
12230// Each Zeroable element corresponds to a particular Mask element, as
12231// described in the computeZeroableShuffleElements function.
12232//
12233// The function looks for a sub-mask whose non-zero elements are in
12234// increasing order. If such a sub-mask exists, the function returns true.
12235static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12236 ArrayRef<int> Mask, const EVT &VectorType,
12237 bool &IsZeroSideLeft) {
12238 int NextElement = -1;
12239 // Check if the Mask's nonzero elements are in increasing order.
12240 for (int i = 0, e = Mask.size(); i < e; i++) {
12241 // Checks if the mask's zeros elements are built from only zeros.
12242 assert(Mask[i] >= -1 && "Out of bound mask element!");
12243 if (Mask[i] < 0)
12244 return false;
12245 if (Zeroable[i])
12246 continue;
12247 // Find the lowest non zero element
12248 if (NextElement < 0) {
12249 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12250 IsZeroSideLeft = NextElement != 0;
12251 }
12252 // Exit if the mask's non zero elements are not in increasing order.
12253 if (NextElement != Mask[i])
12254 return false;
12255 NextElement++;
12256 }
12257 return true;
12258}
12259
12260/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
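/// For illustration, a single-input v4i32 mask <1, u, z, 0> (u = undef,
/// z = zeroable) expands to the byte-level PSHUFB mask
/// <4,5,6,7, u,u,u,u, 0x80,0x80,0x80,0x80, 0,1,2,3>.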
12261static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12262 ArrayRef<int> Mask, SDValue V1,
12263 SDValue V2, const APInt &Zeroable,
12264 const X86Subtarget &Subtarget,
12265 SelectionDAG &DAG) {
12266 int Size = Mask.size();
12267 int LaneSize = 128 / VT.getScalarSizeInBits();
12268 const int NumBytes = VT.getSizeInBits() / 8;
12269 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12270
12271 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12272 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12273 (Subtarget.hasBWI() && VT.is512BitVector()));
12274
12275 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12276 // Sign bit set in i8 mask means zero element.
12277 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12278
12279 SDValue V;
12280 for (int i = 0; i < NumBytes; ++i) {
12281 int M = Mask[i / NumEltBytes];
12282 if (M < 0) {
12283 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12284 continue;
12285 }
12286 if (Zeroable[i / NumEltBytes]) {
12287 PSHUFBMask[i] = ZeroMask;
12288 continue;
12289 }
12290
12291 // We can only use a single input of V1 or V2.
12292 SDValue SrcV = (M >= Size ? V2 : V1);
12293 if (V && V != SrcV)
12294 return SDValue();
12295 V = SrcV;
12296 M %= Size;
12297
12298 // PSHUFB can't cross lanes, ensure this doesn't happen.
12299 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12300 return SDValue();
12301
12302 M = M % LaneSize;
12303 M = M * NumEltBytes + (i % NumEltBytes);
12304 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12305 }
12306 assert(V && "Failed to find a source input");
12307
12308 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12309 return DAG.getBitcast(
12310 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12311 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12312}
12313
12314static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12315 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12316 const SDLoc &dl);
12317
12318// X86 has a dedicated shuffle that can be lowered to VEXPAND
12319static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12320 const APInt &Zeroable,
12321 ArrayRef<int> Mask, SDValue &V1,
12322 SDValue &V2, SelectionDAG &DAG,
12323 const X86Subtarget &Subtarget) {
12324 bool IsLeftZeroSide = true;
12325 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12326 IsLeftZeroSide))
12327 return SDValue();
12328 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12329 MVT IntegerType =
12330 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12331 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12332 unsigned NumElts = VT.getVectorNumElements();
12333 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12334 "Unexpected number of vector elements");
12335 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12336 Subtarget, DAG, DL);
12337 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12338 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12339 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12340}
12341
12342static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12343 unsigned &UnpackOpcode, bool IsUnary,
12344 ArrayRef<int> TargetMask, const SDLoc &DL,
12345 SelectionDAG &DAG,
12346 const X86Subtarget &Subtarget) {
12347 int NumElts = VT.getVectorNumElements();
12348
12349 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12350 for (int i = 0; i != NumElts; i += 2) {
12351 int M1 = TargetMask[i + 0];
12352 int M2 = TargetMask[i + 1];
12353 Undef1 &= (SM_SentinelUndef == M1);
12354 Undef2 &= (SM_SentinelUndef == M2);
12355 Zero1 &= isUndefOrZero(M1);
12356 Zero2 &= isUndefOrZero(M2);
12357 }
12358 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12359 "Zeroable shuffle detected");
12360
12361 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12362 SmallVector<int, 64> Unpckl, Unpckh;
12363 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12364 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12365 (IsUnary ? V1 : V2))) {
12366 UnpackOpcode = X86ISD::UNPCKL;
12367 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12368 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12369 return true;
12370 }
12371
12372 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12373 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12374 (IsUnary ? V1 : V2))) {
12375 UnpackOpcode = X86ISD::UNPCKH;
12376 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12377 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12378 return true;
12379 }
12380
12381 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
12382 if (IsUnary && (Zero1 || Zero2)) {
12383 // Don't bother if we can blend instead.
12384 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12385 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12386 return false;
12387
12388 bool MatchLo = true, MatchHi = true;
12389 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12390 int M = TargetMask[i];
12391
12392 // Ignore if the input is known to be zero or the index is undef.
12393 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12394 (M == SM_SentinelUndef))
12395 continue;
12396
12397 MatchLo &= (M == Unpckl[i]);
12398 MatchHi &= (M == Unpckh[i]);
12399 }
12400
12401 if (MatchLo || MatchHi) {
12402 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12403 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12404 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12405 return true;
12406 }
12407 }
12408
12409 // If a binary shuffle, commute and try again.
12410 if (!IsUnary) {
12411 ShuffleVectorSDNode::commuteMask(Unpckl);
12412 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12413 UnpackOpcode = X86ISD::UNPCKL;
12414 std::swap(V1, V2);
12415 return true;
12416 }
12417
12418 ShuffleVectorSDNode::commuteMask(Unpckh);
12419 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12420 UnpackOpcode = X86ISD::UNPCKH;
12421 std::swap(V1, V2);
12422 return true;
12423 }
12424 }
12425
12426 return false;
12427}
12428
12429// X86 has dedicated unpack instructions that can handle specific blend
12430// operations: UNPCKH and UNPCKL.
12431static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12432 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12433 SelectionDAG &DAG) {
12434 SmallVector<int, 8> Unpckl;
12435 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12436 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12437 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12438
12439 SmallVector<int, 8> Unpckh;
12440 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12441 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12442 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12443
12444 // Commute and try again.
12445 ShuffleVectorSDNode::commuteMask(Unpckl);
12446 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12447 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12448
12449 ShuffleVectorSDNode::commuteMask(Unpckh);
12450 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12451 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12452
12453 return SDValue();
12454}
12455
12456/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12457/// followed by unpack 256-bit.
12458static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12459 ArrayRef<int> Mask, SDValue V1,
12460 SDValue V2, SelectionDAG &DAG) {
12461 SmallVector<int, 32> Unpckl, Unpckh;
12462 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12463 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12464
12465 unsigned UnpackOpcode;
12466 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12467 UnpackOpcode = X86ISD::UNPCKL;
12468 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12469 UnpackOpcode = X86ISD::UNPCKH;
12470 else
12471 return SDValue();
12472
12473 // This is a "natural" unpack operation (rather than the 128-bit sectored
12474 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12475 // input in order to use the x86 instruction.
12476 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12477 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12478 V1 = DAG.getBitcast(VT, V1);
12479 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12480}
12481
12482// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12483// source into the lower elements and zeroing the upper elements.
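// For example, on a BWI target the v16i8 mask <0,2,4,6,8,10,12,14, z..z> (with
// the upper half zeroable) matches at Scale == 2, giving SrcVT == v8i16 and
// DstVT == v16i8 via the X86ISD::VTRUNC path (the 64-bit truncation is padded
// to 128 bits).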
12484static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12485 ArrayRef<int> Mask, const APInt &Zeroable,
12486 const X86Subtarget &Subtarget) {
12487 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12488 return false;
12489
12490 unsigned NumElts = Mask.size();
12491 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12492 unsigned MaxScale = 64 / EltSizeInBits;
12493
12494 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12495 unsigned SrcEltBits = EltSizeInBits * Scale;
12496 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12497 continue;
12498 unsigned NumSrcElts = NumElts / Scale;
12499 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12500 continue;
12501 unsigned UpperElts = NumElts - NumSrcElts;
12502 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12503 continue;
12504 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12505 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12506 DstVT = MVT::getIntegerVT(EltSizeInBits);
12507 if ((NumSrcElts * EltSizeInBits) >= 128) {
12508 // ISD::TRUNCATE
12509 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12510 } else {
12511 // X86ISD::VTRUNC
12512 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12513 }
12514 return true;
12515 }
12516
12517 return false;
12518}
12519
12520// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12521// element padding to the final DstVT.
12522static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12523 const X86Subtarget &Subtarget,
12524 SelectionDAG &DAG, bool ZeroUppers) {
12525 MVT SrcVT = Src.getSimpleValueType();
12526 MVT DstSVT = DstVT.getScalarType();
12527 unsigned NumDstElts = DstVT.getVectorNumElements();
12528 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12529 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12530
12531 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12532 return SDValue();
12533
12534 // Perform a direct ISD::TRUNCATE if possible.
12535 if (NumSrcElts == NumDstElts)
12536 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12537
12538 if (NumSrcElts > NumDstElts) {
12539 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12540 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12541 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12542 }
12543
12544 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12545 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12546 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12547 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12548 DstVT.getSizeInBits());
12549 }
12550
12551 // Non-VLX targets must truncate from a 512-bit type, so we need to
12552 // widen, truncate and then possibly extract the original subvector.
12553 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12554 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12555 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12556 }
12557
12558 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12559 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12560 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12561 if (DstVT != TruncVT)
12562 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12563 DstVT.getSizeInBits());
12564 return Trunc;
12565}
12566
12567// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12568//
12569// An example is the following:
12570//
12571// t0: ch = EntryToken
12572// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12573// t25: v4i32 = truncate t2
12574// t41: v8i16 = bitcast t25
12575// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12576// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12577// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12578// t18: v2i64 = bitcast t51
12579//
12580// One can just use a single vpmovdw instruction; without avx512vl we need to
12581// use the zmm variant and extract the lower subvector, padding with zeroes.
12582// TODO: Merge with lowerShuffleAsVTRUNC.
12583static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12584 SDValue V2, ArrayRef<int> Mask,
12585 const APInt &Zeroable,
12586 const X86Subtarget &Subtarget,
12587 SelectionDAG &DAG) {
12588 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12589 if (!Subtarget.hasAVX512())
12590 return SDValue();
12591
12592 unsigned NumElts = VT.getVectorNumElements();
12593 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12594 unsigned MaxScale = 64 / EltSizeInBits;
12595 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12596 unsigned SrcEltBits = EltSizeInBits * Scale;
12597 unsigned NumSrcElts = NumElts / Scale;
12598 unsigned UpperElts = NumElts - NumSrcElts;
12599 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12600 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12601 continue;
12602
12603 // Attempt to find a matching source truncation, but as a fall back VLX
12604 // cases can use the VPMOV directly.
12605 SDValue Src = peekThroughBitcasts(V1);
12606 if (Src.getOpcode() == ISD::TRUNCATE &&
12607 Src.getScalarValueSizeInBits() == SrcEltBits) {
12608 Src = Src.getOperand(0);
12609 } else if (Subtarget.hasVLX()) {
12610 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12611 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12612 Src = DAG.getBitcast(SrcVT, Src);
12613 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12614 if (Scale == 2 &&
12615 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12616 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12617 return SDValue();
12618 } else
12619 return SDValue();
12620
12621 // VPMOVWB is only available with avx512bw.
12622 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12623 return SDValue();
12624
12625 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12626 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12627 }
12628
12629 return SDValue();
12630}
12631
12632// Attempt to match binary shuffle patterns as a truncate.
12633static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12634 SDValue V2, ArrayRef<int> Mask,
12635 const APInt &Zeroable,
12636 const X86Subtarget &Subtarget,
12637 SelectionDAG &DAG) {
12638 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12639 "Unexpected VTRUNC type");
12640 if (!Subtarget.hasAVX512())
12641 return SDValue();
12642
12643 unsigned NumElts = VT.getVectorNumElements();
12644 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12645 unsigned MaxScale = 64 / EltSizeInBits;
12646 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12647 // TODO: Support non-BWI VPMOVWB truncations?
12648 unsigned SrcEltBits = EltSizeInBits * Scale;
12649 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12650 continue;
12651
12652 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12653 // Bail if the V2 elements are undef.
12654 unsigned NumHalfSrcElts = NumElts / Scale;
12655 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12656 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12657 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12658 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12659 continue;
12660
12661 // The elements beyond the truncation must be undef/zero.
12662 unsigned UpperElts = NumElts - NumSrcElts;
12663 if (UpperElts > 0 &&
12664 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12665 continue;
12666 bool UndefUppers =
12667 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12668
12669 // For offset truncations, ensure that the concat is cheap.
12670 if (Offset) {
12671 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12672 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12673 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12674 return Lo.getOperand(0) == Hi.getOperand(0);
12675 if (ISD::isNormalLoad(Lo.getNode()) &&
12676 ISD::isNormalLoad(Hi.getNode())) {
12677 auto *LDLo = cast<LoadSDNode>(Lo);
12678 auto *LDHi = cast<LoadSDNode>(Hi);
12679 return DAG.areNonVolatileConsecutiveLoads(
12680 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12681 }
12682 return false;
12683 };
12684 if (!IsCheapConcat(V1, V2))
12685 continue;
12686 }
12687
12688 // As we're using both sources, we need to concat them together
12689 // and truncate from the double-sized src.
12690 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12691 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12692
12693 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12694 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12695 Src = DAG.getBitcast(SrcVT, Src);
12696
12697 // Shift the offset'd elements into place for the truncation.
12698 // TODO: Use getTargetVShiftByConstNode.
12699 if (Offset)
12700 Src = DAG.getNode(
12701 X86ISD::VSRLI, DL, SrcVT, Src,
12702 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12703
12704 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12705 }
12706 }
12707
12708 return SDValue();
12709}
12710
12711/// Check whether a compaction lowering can be done by dropping even/odd
12712/// elements and compute how many times even/odd elements must be dropped.
12713///
12714/// This handles shuffles which take every Nth element where N is a power of
12715/// two. Example shuffle masks:
12716///
12717/// (even)
12718/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12719/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12720/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12721/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12722/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12723/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12724///
12725/// (odd)
12726/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12727/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12728///
12729/// Any of these lanes can of course be undef.
12730///
12731/// This routine only supports N <= 3.
12732/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12733/// for larger N.
12734///
12735/// \returns N above, or the number of times even/odd elements must be dropped
12736/// if there is such a number. Otherwise returns zero.
12737static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12738 bool IsSingleInput) {
12739 // The modulus for the shuffle vector entries is based on whether this is
12740 // a single input or not.
12741 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12742 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12743 "We should only be called with masks with a power-of-2 size!");
12744
12745 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12746 int Offset = MatchEven ? 0 : 1;
12747
12748 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12749 // and 2^3 simultaneously. This is because we may have ambiguity with
12750 // partially undef inputs.
12751 bool ViableForN[3] = {true, true, true};
12752
12753 for (int i = 0, e = Mask.size(); i < e; ++i) {
12754 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12755 // want.
12756 if (Mask[i] < 0)
12757 continue;
12758
12759 bool IsAnyViable = false;
12760 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12761 if (ViableForN[j]) {
12762 uint64_t N = j + 1;
12763
12764 // The shuffle mask must be equal to (i * 2^N) % M.
12765 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12766 IsAnyViable = true;
12767 else
12768 ViableForN[j] = false;
12769 }
12770 // Early exit if we exhaust the possible powers of two.
12771 if (!IsAnyViable)
12772 break;
12773 }
12774
12775 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12776 if (ViableForN[j])
12777 return j + 1;
12778
12779 // Return 0 as there is no viable power of two.
12780 return 0;
12781}
12782
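// Hypothetical standalone restatement (not from the original file) of the
// even/odd dropping check above, reduced to plain C++ so the
// (i << N) & ModMask arithmetic can be traced; the mask and function name are
// invented for the example.
#include <cstdint>
#include <cstdio>
#include <vector>

static int sketchCanDropElements(const std::vector<int> &Mask, bool MatchEven,
                                 bool IsSingleInput) {
  uint64_t ModMask = Mask.size() * (IsSingleInput ? 1 : 2) - 1;
  int Offset = MatchEven ? 0 : 1;
  bool Viable[3] = {true, true, true};
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes optimistically match anything
    for (unsigned j = 0; j != 3; ++j)
      if (Viable[j] &&
          (uint64_t)(Mask[i] - Offset) != (((uint64_t)i << (j + 1)) & ModMask))
        Viable[j] = false;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0;
}

int main() {
  // <0,4,8,12> repeated: every 4th element of a single 16-element input.
  std::vector<int> Mask = {0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12};
  std::printf("N = %d\n", sketchCanDropElements(Mask, /*MatchEven=*/true,
                                                /*IsSingleInput=*/true)); // 2
}
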
12783// X86 has dedicated pack instructions that can handle specific truncation
12784// operations: PACKSS and PACKUS.
12785// Checks for compaction shuffle masks if MaxStages > 1.
12786// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12787static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12788 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12789 const SelectionDAG &DAG,
12790 const X86Subtarget &Subtarget,
12791 unsigned MaxStages = 1) {
12792 unsigned NumElts = VT.getVectorNumElements();
12793 unsigned BitSize = VT.getScalarSizeInBits();
12794 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12795 "Illegal maximum compaction");
12796
12797 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12798 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12799 unsigned NumPackedBits = NumSrcBits - BitSize;
12800 N1 = peekThroughBitcasts(N1);
12801 N2 = peekThroughBitcasts(N2);
12802 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12803 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12804 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12805 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12806 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12807 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12808 return false;
12809 if (Subtarget.hasSSE41() || BitSize == 8) {
12810 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12811 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12812 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12813 V1 = N1;
12814 V2 = N2;
12815 SrcVT = PackVT;
12816 PackOpcode = X86ISD::PACKUS;
12817 return true;
12818 }
12819 }
12820 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12821 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12822 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12823 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12824 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12825 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12826 V1 = N1;
12827 V2 = N2;
12828 SrcVT = PackVT;
12829 PackOpcode = X86ISD::PACKSS;
12830 return true;
12831 }
12832 return false;
12833 };
12834
12835 // Attempt to match against wider and wider compaction patterns.
12836 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12837 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12838 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12839
12840 // Try binary shuffle.
12841 SmallVector<int, 32> BinaryMask;
12842 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12843 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12844 if (MatchPACK(V1, V2, PackVT))
12845 return true;
12846
12847 // Try unary shuffle.
12848 SmallVector<int, 32> UnaryMask;
12849 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12850 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12851 if (MatchPACK(V1, V1, PackVT))
12852 return true;
12853 }
12854
12855 return false;
12856}
12857
12858static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12859 SDValue V1, SDValue V2, SelectionDAG &DAG,
12860 const X86Subtarget &Subtarget) {
12861 MVT PackVT;
12862 unsigned PackOpcode;
12863 unsigned SizeBits = VT.getSizeInBits();
12864 unsigned EltBits = VT.getScalarSizeInBits();
12865 unsigned MaxStages = Log2_32(64 / EltBits);
12866 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12867 Subtarget, MaxStages))
12868 return SDValue();
12869
12870 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12871 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12872
12873 // Don't lower multi-stage packs on AVX512, truncation is better.
12874 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12875 return SDValue();
12876
12877 // Pack to the largest type possible:
12878 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12879 unsigned MaxPackBits = 16;
12880 if (CurrentEltBits > 16 &&
12881 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12882 MaxPackBits = 32;
12883
12884 // Repeatedly pack down to the target size.
12885 SDValue Res;
12886 for (unsigned i = 0; i != NumStages; ++i) {
12887 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12888 unsigned NumSrcElts = SizeBits / SrcEltBits;
12889 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12890 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12891 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12892 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12893 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12894 DAG.getBitcast(SrcVT, V2));
12895 V1 = V2 = Res;
12896 CurrentEltBits /= 2;
12897 }
12898 assert(Res && Res.getValueType() == VT &&
12899 "Failed to lower compaction shuffle");
12900 return Res;
12901}
12902
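// Rough trace (not code from the original file) of the multi-stage pack loop
// above, using example widths: a v16i8 target reached from a v4i32-wide pack
// source takes two PACK stages.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned SizeBits = 128, EltBits = 8; // target: v16i8
  unsigned CurrentEltBits = 32;         // matched pack source width: v4i32
  unsigned NumStages = 0;
  for (unsigned B = CurrentEltBits; B > EltBits; B /= 2)
    ++NumStages;                        // log2(32 / 8) == 2 stages
  unsigned MaxPackBits = 32;            // PACKSSDW (or PACKUSDW with SSE4.1)
  for (unsigned i = 0; i != NumStages; ++i) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    std::printf("stage %u: v%ui%u -> v%ui%u\n", i, SizeBits / SrcEltBits,
                SrcEltBits, 2 * (SizeBits / SrcEltBits), SrcEltBits / 2);
    CurrentEltBits /= 2; // stage 0: v4i32 -> v8i16, stage 1: v8i16 -> v16i8
  }
}
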
12903/// Try to emit a bitmask instruction for a shuffle.
12904///
12905/// This handles cases where we can model a blend exactly as a bitmask due to
12906/// one of the inputs being zeroable.
12907static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12908 SDValue V2, ArrayRef<int> Mask,
12909 const APInt &Zeroable,
12910 const X86Subtarget &Subtarget,
12911 SelectionDAG &DAG) {
12912 MVT MaskVT = VT;
12913 MVT EltVT = VT.getVectorElementType();
12914 SDValue Zero, AllOnes;
12915 // Use f64 if i64 isn't legal.
12916 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12917 EltVT = MVT::f64;
12918 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12919 }
12920
12921 MVT LogicVT = VT;
12922 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12923 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12924 APFloat AllOnesValue =
12925 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12926 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12927 LogicVT =
12928 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12929 } else {
12930 Zero = DAG.getConstant(0, DL, EltVT);
12931 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12932 }
12933
12934 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12935 SDValue V;
12936 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12937 if (Zeroable[i])
12938 continue;
12939 if (Mask[i] % Size != i)
12940 return SDValue(); // Not a blend.
12941 if (!V)
12942 V = Mask[i] < Size ? V1 : V2;
12943 else if (V != (Mask[i] < Size ? V1 : V2))
12944 return SDValue(); // Can only let one input through the mask.
12945
12946 VMaskOps[i] = AllOnes;
12947 }
12948 if (!V)
12949 return SDValue(); // No non-zeroable elements!
12950
12951 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12952 VMask = DAG.getBitcast(LogicVT, VMask);
12953 V = DAG.getBitcast(LogicVT, V);
12954 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12955 return DAG.getBitcast(VT, And);
12956}
12957
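// Tiny illustrative model (not from the original file) of the bitmask lowering
// above: known-zeroable lanes get a zero mask, pass-through lanes get all-ones,
// and the whole shuffle becomes a single AND. Values are arbitrary.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t V1[4] = {11, 22, 33, 44};
  // Lanes 1 and 3 are zeroable, lanes 0 and 2 pass V1 through unchanged.
  uint32_t VMask[4] = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};
  for (int i = 0; i != 4; ++i)
    std::printf("%u ", (unsigned)(V1[i] & VMask[i])); // 11 0 33 0
  std::printf("\n");
}
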
12958/// Try to emit a blend instruction for a shuffle using bit math.
12959///
12960/// This is used as a fallback approach when first class blend instructions are
12961/// unavailable. Currently it is only suitable for integer vectors, but could
12962/// be generalized for floating point vectors if desirable.
12963static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12964 SDValue V2, ArrayRef<int> Mask,
12965 SelectionDAG &DAG) {
12966 assert(VT.isInteger() && "Only supports integer vector types!");
12967 MVT EltVT = VT.getVectorElementType();
12968 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12969 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12970 SmallVector<SDValue, 16> MaskOps;
12971 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12972 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12973 return SDValue(); // Shuffled input!
12974 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12975 }
12976
12977 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12978 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12979 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12980 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12981}
12982
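// Simplified scalar model (not from the original file) of the AND/ANDNP/OR
// blend above, i.e. Res = (V1 & M) | (~M & V2) with an all-ones mask lane
// selecting V1; lane values and the example mask are made up.
#include <cstdint>
#include <cstdio>

int main() {
  uint16_t V1[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t V2[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  // Shuffle mask <0, 9, 2, 11, 4, 13, 6, 15>: even lanes from V1, odd from V2.
  uint16_t M[8] = {0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0};
  for (int i = 0; i != 8; ++i) {
    uint16_t Res = (uint16_t)((V1[i] & M[i]) | (~M[i] & V2[i]));
    std::printf("%u ", (unsigned)Res); // 1 20 3 40 5 60 7 80
  }
  std::printf("\n");
}
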
12983static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12984 SDValue PreservedSrc,
12985 const X86Subtarget &Subtarget,
12986 SelectionDAG &DAG);
12987
12988static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
12989 MutableArrayRef<int> Mask,
12990 const APInt &Zeroable, bool &ForceV1Zero,
12991 bool &ForceV2Zero, uint64_t &BlendMask) {
12992 bool V1IsZeroOrUndef =
12993 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12994 bool V2IsZeroOrUndef =
12995 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12996
12997 BlendMask = 0;
12998 ForceV1Zero = false, ForceV2Zero = false;
12999 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
13000
13001 int NumElts = Mask.size();
13002 int NumLanes = VT.getSizeInBits() / 128;
13003 int NumEltsPerLane = NumElts / NumLanes;
13004 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13005
13006 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13007 // then ensure the blend mask part for that lane just references that input.
13008 bool ForceWholeLaneMasks =
13009 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13010
13011 // Attempt to generate the binary blend mask. If an input is zero then
13012 // we can use any lane.
13013 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13014 // Keep track of the inputs used per lane.
13015 bool LaneV1InUse = false;
13016 bool LaneV2InUse = false;
13017 uint64_t LaneBlendMask = 0;
13018 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13019 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13020 int M = Mask[Elt];
13021 if (M == SM_SentinelUndef)
13022 continue;
13023 if (M == Elt || (0 <= M && M < NumElts &&
13024 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13025 Mask[Elt] = Elt;
13026 LaneV1InUse = true;
13027 continue;
13028 }
13029 if (M == (Elt + NumElts) ||
13030 (NumElts <= M &&
13031 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13032 LaneBlendMask |= 1ull << LaneElt;
13033 Mask[Elt] = Elt + NumElts;
13034 LaneV2InUse = true;
13035 continue;
13036 }
13037 if (Zeroable[Elt]) {
13038 if (V1IsZeroOrUndef) {
13039 ForceV1Zero = true;
13040 Mask[Elt] = Elt;
13041 LaneV1InUse = true;
13042 continue;
13043 }
13044 if (V2IsZeroOrUndef) {
13045 ForceV2Zero = true;
13046 LaneBlendMask |= 1ull << LaneElt;
13047 Mask[Elt] = Elt + NumElts;
13048 LaneV2InUse = true;
13049 continue;
13050 }
13051 }
13052 return false;
13053 }
13054
13055 // If we only used V2 then splat the lane blend mask to avoid any demanded
13056 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13057 // blend mask bit).
13058 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13059 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13060
13061 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13062 }
13063 return true;
13064}
13065
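// Standalone sketch (not from the original file) of the immediate blend-mask
// construction for the simple single-lane, no-zeroable case: bit i is set iff
// element i comes from V2. The mask values are illustrative only.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8;
  std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15}; // alternate V1/V2
  unsigned long long BlendMask = 0;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;               // undef: either input works
    if (Mask[i] == i)
      continue;               // element i taken from V1
    if (Mask[i] == i + NumElts)
      BlendMask |= 1ull << i; // element i taken from V2
    else
      return 1;               // shuffled within an input: not a blend
  }
  std::printf("BlendMask = 0x%llx\n", BlendMask); // 0xaa, a PBLENDW immediate
}
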
13066static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13067 int Scale) {
13068 uint64_t ScaledMask = 0;
13069 for (int i = 0; i != Size; ++i)
13070 if (BlendMask & (1ull << i))
13071 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13072 return ScaledMask;
13073}
13074
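// The same widening arithmetic as the function above, run on an example mask
// as a quick sanity check; this sketch is not part of the original file.
#include <cstdint>
#include <cstdio>

static uint64_t scaleMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  // A 4-element blend mask 0b0101 widened 2x (e.g. reusing a narrower-element
  // blend for a mask matched on wider elements) becomes 0b00110011.
  std::printf("0x%llx\n", (unsigned long long)scaleMask(0b0101, 4, 2)); // 0x33
}
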
13075/// Try to emit a blend instruction for a shuffle.
13076///
13077/// This doesn't do any checks for the availability of instructions for blending
13078/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13079/// be matched in the backend with the type given. What it does check for is
13080/// that the shuffle mask is a blend, or convertible into a blend with zero.
13081static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13082 SDValue V2, ArrayRef<int> Original,
13083 const APInt &Zeroable,
13084 const X86Subtarget &Subtarget,
13085 SelectionDAG &DAG) {
13086 uint64_t BlendMask = 0;
13087 bool ForceV1Zero = false, ForceV2Zero = false;
13088 SmallVector<int, 64> Mask(Original);
13089 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13090 BlendMask))
13091 return SDValue();
13092
13093 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13094 if (ForceV1Zero)
13095 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13096 if (ForceV2Zero)
13097 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13098
13099 unsigned NumElts = VT.getVectorNumElements();
13100
13101 switch (VT.SimpleTy) {
13102 case MVT::v4i64:
13103 case MVT::v8i32:
13104 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13105 [[fallthrough]];
13106 case MVT::v4f64:
13107 case MVT::v8f32:
13108 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13109 [[fallthrough]];
13110 case MVT::v2f64:
13111 case MVT::v2i64:
13112 case MVT::v4f32:
13113 case MVT::v4i32:
13114 case MVT::v8i16:
13115 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13116 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13117 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13118 case MVT::v16i16: {
13119 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13120 SmallVector<int, 8> RepeatedMask;
13121 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13122 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13123 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13124 BlendMask = 0;
13125 for (int i = 0; i < 8; ++i)
13126 if (RepeatedMask[i] >= 8)
13127 BlendMask |= 1ull << i;
13128 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13129 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13130 }
13131 // Use PBLENDW for lower/upper lanes and then blend lanes.
13132 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13133 // merge to VSELECT where useful.
13134 uint64_t LoMask = BlendMask & 0xFF;
13135 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13136 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13137 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13138 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13139 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13140 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13141 return DAG.getVectorShuffle(
13142 MVT::v16i16, DL, Lo, Hi,
13143 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13144 }
13145 [[fallthrough]];
13146 }
13147 case MVT::v32i8:
13148 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13149 [[fallthrough]];
13150 case MVT::v16i8: {
13151 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13152
13153 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13154 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13155 Subtarget, DAG))
13156 return Masked;
13157
13158 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13159 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13160 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13161 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13162 }
13163
13164 // If we have VPTERNLOG, we can use that as a bit blend.
13165 if (Subtarget.hasVLX())
13166 if (SDValue BitBlend =
13167 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13168 return BitBlend;
13169
13170 // Scale the blend by the number of bytes per element.
13171 int Scale = VT.getScalarSizeInBits() / 8;
13172
13173 // This form of blend is always done on bytes. Compute the byte vector
13174 // type.
13175 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13176
13177 // x86 allows load folding with blendvb from the 2nd source operand. But
13178 // we are still using LLVM select here (see comment below), so that's V1.
13179 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13180 // allow that load-folding possibility.
13181 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13182 ShuffleVectorSDNode::commuteMask(Mask);
13183 std::swap(V1, V2);
13184 }
13185
13186 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13187 // mix of LLVM's code generator and the x86 backend. We tell the code
13188 // generator that boolean values in the elements of an x86 vector register
13189 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13190 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13191 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13192 // of the element (the remaining are ignored) and 0 in that high bit would
13193 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13194 // the LLVM model for boolean values in vector elements gets the relevant
13195 // bit set, it is set backwards and over constrained relative to x86's
13196 // actual model.
13197 SmallVector<SDValue, 32> VSELECTMask;
13198 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13199 for (int j = 0; j < Scale; ++j)
13200 VSELECTMask.push_back(
13201 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13202 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13203 MVT::i8));
13204
13205 V1 = DAG.getBitcast(BlendVT, V1);
13206 V2 = DAG.getBitcast(BlendVT, V2);
13207 return DAG.getBitcast(
13208 VT,
13209 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13210 V1, V2));
13211 }
13212 case MVT::v16f32:
13213 case MVT::v8f64:
13214 case MVT::v8i64:
13215 case MVT::v16i32:
13216 case MVT::v32i16:
13217 case MVT::v64i8: {
13218 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13219 bool OptForSize = DAG.shouldOptForSize();
13220 if (!OptForSize) {
13221 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13222 Subtarget, DAG))
13223 return Masked;
13224 }
13225
13226 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13227 // masked move.
13228 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13229 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13230 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13231 }
13232 default:
13233 llvm_unreachable("Not a supported integer vector type!");
13234 }
13235}
13236
13237/// Try to lower as a blend of elements from two inputs followed by
13238/// a single-input permutation.
13239///
13240/// This matches the pattern where we can blend elements from two inputs and
13241/// then reduce the shuffle to a single-input permutation.
13242static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13243 SDValue V1, SDValue V2,
13244 ArrayRef<int> Mask,
13245 SelectionDAG &DAG,
13246 bool ImmBlends = false) {
13247 // We build up the blend mask while checking whether a blend is a viable way
13248 // to reduce the shuffle.
13249 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13250 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13251
13252 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13253 if (Mask[i] < 0)
13254 continue;
13255
13256 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13257
13258 if (BlendMask[Mask[i] % Size] < 0)
13259 BlendMask[Mask[i] % Size] = Mask[i];
13260 else if (BlendMask[Mask[i] % Size] != Mask[i])
13261 return SDValue(); // Can't blend in the needed input!
13262
13263 PermuteMask[i] = Mask[i] % Size;
13264 }
13265
13266 // If only immediate blends are allowed, bail if the blend mask can't be
13267 // widened to i16.
13268 unsigned EltSize = VT.getScalarSizeInBits();
13269 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13270 return SDValue();
13271
13272 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13273 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13274}
13275
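// Worked standalone example (not from the original file) of the blend+permute
// decomposition above on a concrete two-input v4 mask; all names are local to
// this sketch.
#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;
  std::vector<int> Mask = {2, 1, 4, 7}; // V1[2], V1[1], V2[0], V2[3]
  std::vector<int> Blend(Size, -1), Permute(Size, -1);
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (Blend[M % Size] >= 0 && Blend[M % Size] != M)
      return 1; // both inputs want the same blend lane: not decomposable
    Blend[M % Size] = M;
    Permute[i] = M % Size;
  }
  // Blend   = <4, 1, 2, 7>  (lane-wise pick of V1/V2)
  // Permute = <2, 1, 0, 3>  (single-input reorder of the blended result)
  for (int i = 0; i != Size; ++i)
    std::printf("blend[%d]=%d permute[%d]=%d\n", i, Blend[i], i, Permute[i]);
}
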
13276/// Try to lower as an unpack of elements from two inputs followed by
13277/// a single-input permutation.
13278///
13279/// This matches the pattern where we can unpack elements from two inputs and
13280/// then reduce the shuffle to a single-input (wider) permutation.
13281static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13282 SDValue V1, SDValue V2,
13283 ArrayRef<int> Mask,
13284 SelectionDAG &DAG) {
13285 int NumElts = Mask.size();
13286 int NumLanes = VT.getSizeInBits() / 128;
13287 int NumLaneElts = NumElts / NumLanes;
13288 int NumHalfLaneElts = NumLaneElts / 2;
13289
13290 bool MatchLo = true, MatchHi = true;
13291 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13292
13293 // Determine UNPCKL/UNPCKH type and operand order.
13294 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13295 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
13296 int M = Mask[Lane + Elt];
13297 if (M < 0)
13298 continue;
13299
13300 SDValue &Op = Ops[Elt & 1];
13301 if (M < NumElts && (Op.isUndef() || Op == V1))
13302 Op = V1;
13303 else if (NumElts <= M && (Op.isUndef() || Op == V2))
13304 Op = V2;
13305 else
13306 return SDValue();
13307
13308 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13309 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
13310 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
13311 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
13312 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
13313 if (!MatchLo && !MatchHi)
13314 return SDValue();
13315 }
13316 }
13317 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13318
13319 // Now check that each pair of elts come from the same unpack pair
13320 // and set the permute mask based on each pair.
13321 // TODO - Investigate cases where we permute individual elements.
13322 SmallVector<int, 32> PermuteMask(NumElts, -1);
13323 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13324 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
13325 int M0 = Mask[Lane + Elt + 0];
13326 int M1 = Mask[Lane + Elt + 1];
13327 if (0 <= M0 && 0 <= M1 &&
13328 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
13329 return SDValue();
13330 if (0 <= M0)
13331 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
13332 if (0 <= M1)
13333 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
13334 }
13335 }
13336
13337 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13338 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13339 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13340}
13341
13342/// Try to lower a shuffle as a permute of the inputs followed by an
13343/// UNPCK instruction.
13344///
13345/// This specifically targets cases where we end up with alternating between
13346/// the two inputs, and so can permute them into something that feeds a single
13347/// UNPCK instruction. Note that this routine only targets integer vectors
13348/// because for floating point vectors we have a generalized SHUFPS lowering
13349/// strategy that handles everything that doesn't *exactly* match an unpack,
13350/// making this clever lowering unnecessary.
13351static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13352 SDValue V1, SDValue V2,
13353 ArrayRef<int> Mask,
13354 const X86Subtarget &Subtarget,
13355 SelectionDAG &DAG) {
13356 int Size = Mask.size();
13357 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13358
13359 // This routine only supports 128-bit integer dual input vectors.
13360 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13361 return SDValue();
13362
13363 int NumLoInputs =
13364 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13365 int NumHiInputs =
13366 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13367
13368 bool UnpackLo = NumLoInputs >= NumHiInputs;
13369
13370 auto TryUnpack = [&](int ScalarSize, int Scale) {
13371 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13372 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13373
13374 for (int i = 0; i < Size; ++i) {
13375 if (Mask[i] < 0)
13376 continue;
13377
13378 // Each element of the unpack contains Scale elements from this mask.
13379 int UnpackIdx = i / Scale;
13380
13381 // We only handle the case where V1 feeds the first slots of the unpack.
13382 // We rely on canonicalization to ensure this is the case.
13383 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13384 return SDValue();
13385
13386 // Setup the mask for this input. The indexing is tricky as we have to
13387 // handle the unpack stride.
13388 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13389 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13390 Mask[i] % Size;
13391 }
13392
13393 // If we will have to shuffle both inputs to use the unpack, check whether
13394 // we can just unpack first and shuffle the result. If so, skip this unpack.
13395 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13396 !isNoopShuffleMask(V2Mask))
13397 return SDValue();
13398
13399 // Shuffle the inputs into place.
13400 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13401 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13402
13403 // Cast the inputs to the type we will use to unpack them.
13404 MVT UnpackVT =
13405 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13406 V1 = DAG.getBitcast(UnpackVT, V1);
13407 V2 = DAG.getBitcast(UnpackVT, V2);
13408
13409 // Unpack the inputs and cast the result back to the desired type.
13410 return DAG.getBitcast(
13411 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13412 UnpackVT, V1, V2));
13413 };
13414
13415 // We try each unpack from the largest to the smallest to try and find one
13416 // that fits this mask.
13417 int OrigScalarSize = VT.getScalarSizeInBits();
13418 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13419 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13420 return Unpack;
13421
13422 // If we're shuffling with a zero vector then we're better off not doing
13423 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13424 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13425 ISD::isBuildVectorAllZeros(V2.getNode()))
13426 return SDValue();
13427
13428 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13429 // initial unpack.
13430 if (NumLoInputs == 0 || NumHiInputs == 0) {
13431 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13432 "We have to have *some* inputs!");
13433 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13434
13435 // FIXME: We could consider the total complexity of the permute of each
13436 // possible unpacking. Or at the least we should consider how many
13437 // half-crossings are created.
13438 // FIXME: We could consider commuting the unpacks.
13439
13440 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13441 for (int i = 0; i < Size; ++i) {
13442 if (Mask[i] < 0)
13443 continue;
13444
13445 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13446
13447 PermMask[i] =
13448 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13449 }
13450 return DAG.getVectorShuffle(
13451 VT, DL,
13452 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13453 V1, V2),
13454 DAG.getUNDEF(VT), PermMask);
13455 }
13456
13457 return SDValue();
13458}
13459
13460/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13461/// permuting the elements of the result in place.
13462static SDValue lowerShuffleAsByteRotateAndPermute(
13463 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13464 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13465 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13466 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13467 (VT.is512BitVector() && !Subtarget.hasBWI()))
13468 return SDValue();
13469
13470 // We don't currently support lane crossing permutes.
13471 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13472 return SDValue();
13473
13474 int Scale = VT.getScalarSizeInBits() / 8;
13475 int NumLanes = VT.getSizeInBits() / 128;
13476 int NumElts = VT.getVectorNumElements();
13477 int NumEltsPerLane = NumElts / NumLanes;
13478
13479 // Determine range of mask elts.
13480 bool Blend1 = true;
13481 bool Blend2 = true;
13482 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13483 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13484 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13485 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13486 int M = Mask[Lane + Elt];
13487 if (M < 0)
13488 continue;
13489 if (M < NumElts) {
13490 Blend1 &= (M == (Lane + Elt));
13491 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13492 M = M % NumEltsPerLane;
13493 Range1.first = std::min(Range1.first, M);
13494 Range1.second = std::max(Range1.second, M);
13495 } else {
13496 M -= NumElts;
13497 Blend2 &= (M == (Lane + Elt));
13498 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13499 M = M % NumEltsPerLane;
13500 Range2.first = std::min(Range2.first, M);
13501 Range2.second = std::max(Range2.second, M);
13502 }
13503 }
13504 }
13505
13506 // Bail if we don't need both elements.
13507 // TODO - it might be worth doing this for unary shuffles if the permute
13508 // can be widened.
13509 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13510 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13511 return SDValue();
13512
13513 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13514 return SDValue();
13515
13516 // Rotate the 2 ops so we can access both ranges, then permute the result.
13517 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13518 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13519 SDValue Rotate = DAG.getBitcast(
13520 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13521 DAG.getBitcast(ByteVT, Lo),
13522 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13523 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13524 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13525 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13526 int M = Mask[Lane + Elt];
13527 if (M < 0)
13528 continue;
13529 if (M < NumElts)
13530 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13531 else
13532 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13533 }
13534 }
13535 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13536 };
13537
13538 // Check if the ranges are small enough to rotate from either direction.
13539 if (Range2.second < Range1.first)
13540 return RotateAndPermute(V1, V2, Range1.first, 0);
13541 if (Range1.second < Range2.first)
13542 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13543 return SDValue();
13544}
13545
13546static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13547 return isUndefOrEqual(Mask, 0);
13548}
13549
13550static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13551 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13552}
13553
13554/// Generic routine to decompose a shuffle and blend into independent
13555/// blends and permutes.
13556///
13557/// This matches the extremely common pattern for handling combined
13558/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13559/// operations. It will try to pick the best arrangement of shuffles and
13560/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13561static SDValue lowerShuffleAsDecomposedShuffleMerge(
13562 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13563 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13564 int NumElts = Mask.size();
13565 int NumLanes = VT.getSizeInBits() / 128;
13566 int NumEltsPerLane = NumElts / NumLanes;
13567
13568 // Shuffle the input elements into the desired positions in V1 and V2 and
13569 // unpack/blend them together.
13570 bool IsAlternating = true;
13571 SmallVector<int, 32> V1Mask(NumElts, -1);
13572 SmallVector<int, 32> V2Mask(NumElts, -1);
13573 SmallVector<int, 32> FinalMask(NumElts, -1);
13574 for (int i = 0; i < NumElts; ++i) {
13575 int M = Mask[i];
13576 if (M >= 0 && M < NumElts) {
13577 V1Mask[i] = M;
13578 FinalMask[i] = i;
13579 IsAlternating &= (i & 1) == 0;
13580 } else if (M >= NumElts) {
13581 V2Mask[i] = M - NumElts;
13582 FinalMask[i] = i + NumElts;
13583 IsAlternating &= (i & 1) == 1;
13584 }
13585 }
13586
13587 // If we effectively only demand the 0'th element of \p Input, and not only
13588 // as 0'th element, then broadcast said input,
13589 // and change \p InputMask to be a no-op (identity) mask.
13590 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13591 &DAG](SDValue &Input,
13592 MutableArrayRef<int> InputMask) {
13593 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13594 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13595 !X86::mayFoldLoad(Input, Subtarget)))
13596 return;
13597 if (isNoopShuffleMask(InputMask))
13598 return;
13599 assert(isBroadcastShuffleMask(InputMask) &&
13600 "Expected to demand only the 0'th element.");
13601 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13602 for (auto I : enumerate(InputMask)) {
13603 int &InputMaskElt = I.value();
13604 if (InputMaskElt >= 0)
13605 InputMaskElt = I.index();
13606 }
13607 };
13608
13609 // Currently, we may need to produce one shuffle per input, and blend results.
13610 // It is possible that the shuffle for one of the inputs is already a no-op.
13611 // See if we can simplify non-no-op shuffles into broadcasts,
13612 // which we consider to be strictly better than an arbitrary shuffle.
13613 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13614 isNoopOrBroadcastShuffleMask(V2Mask)) {
13615 canonicalizeBroadcastableInput(V1, V1Mask);
13616 canonicalizeBroadcastableInput(V2, V2Mask);
13617 }
13618
13619 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13620 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13621 // the shuffle may be able to fold with a load or other benefit. However, when
13622 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13623 // pre-shuffle first is a better strategy.
13624 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13625 // Only prefer immediate blends to unpack/rotate.
13626 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13627 DAG, true))
13628 return BlendPerm;
13629 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
13630 DAG))
13631 return UnpackPerm;
13632 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13633 DL, VT, V1, V2, Mask, Subtarget, DAG))
13634 return RotatePerm;
13635 // Unpack/rotate failed - try again with variable blends.
13636 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13637 DAG))
13638 return BlendPerm;
13639 if (VT.getScalarSizeInBits() >= 32)
13640 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13641 DL, VT, V1, V2, Mask, Subtarget, DAG))
13642 return PermUnpack;
13643 }
13644
13645 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13646 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13647 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13648 // than half the elements coming from each source.
13649 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13650 V1Mask.assign(NumElts, -1);
13651 V2Mask.assign(NumElts, -1);
13652 FinalMask.assign(NumElts, -1);
13653 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13654 for (int j = 0; j != NumEltsPerLane; ++j) {
13655 int M = Mask[i + j];
13656 if (M >= 0 && M < NumElts) {
13657 V1Mask[i + (j / 2)] = M;
13658 FinalMask[i + j] = i + (j / 2);
13659 } else if (M >= NumElts) {
13660 V2Mask[i + (j / 2)] = M - NumElts;
13661 FinalMask[i + j] = i + (j / 2) + NumElts;
13662 }
13663 }
13664 }
13665
13666 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13667 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13668 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13669}
13670
13671/// Try to lower a vector shuffle as a bit rotation.
13672///
13673/// Look for a repeated rotation pattern in each sub group.
13674 /// Returns an ISD::ROTL element rotation amount or -1 on failure.
13675static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13676 int NumElts = Mask.size();
13677 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13678
13679 int RotateAmt = -1;
13680 for (int i = 0; i != NumElts; i += NumSubElts) {
13681 for (int j = 0; j != NumSubElts; ++j) {
13682 int M = Mask[i + j];
13683 if (M < 0)
13684 continue;
13685 if (!isInRange(M, i, i + NumSubElts))
13686 return -1;
13687 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13688 if (0 <= RotateAmt && Offset != RotateAmt)
13689 return -1;
13690 RotateAmt = Offset;
13691 }
13692 }
13693 return RotateAmt;
13694}
13695
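// Standalone restatement (not from the original file) of the per-group rotation
// check above, run on a mask where every 4-element group is rotated by one
// element; the helper name is invented.
#include <cstdio>
#include <vector>

static int sketchMatchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1; // crosses a sub-group boundary
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // inconsistent rotation amounts
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  std::vector<int> Mask = {3, 0, 1, 2, 7, 4, 5, 6};
  // ROTL of each packed 4-element group by 1 element; the caller above scales
  // the result by EltSizeInBits.
  std::printf("RotateAmt = %d\n", sketchMatchBitRotate(Mask, 4)); // 1
}
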
13696static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13697 const X86Subtarget &Subtarget,
13698 ArrayRef<int> Mask) {
13699 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13700 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13701
13702 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13703 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13704 int MaxSubElts = 64 / EltSizeInBits;
13705 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13706 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13707 if (RotateAmt < 0)
13708 continue;
13709
13710 int NumElts = Mask.size();
13711 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13712 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13713 return RotateAmt * EltSizeInBits;
13714 }
13715
13716 return -1;
13717}
13718
13719/// Lower shuffle using X86ISD::VROTLI rotations.
13720static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13721 ArrayRef<int> Mask,
13722 const X86Subtarget &Subtarget,
13723 SelectionDAG &DAG) {
13724 // Only XOP + AVX512 targets have bit rotation instructions.
13725 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13726 bool IsLegal =
13727 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13728 if (!IsLegal && Subtarget.hasSSE3())
13729 return SDValue();
13730
13731 MVT RotateVT;
13732 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13733 Subtarget, Mask);
13734 if (RotateAmt < 0)
13735 return SDValue();
13736
13737 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13738 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13739 // widen to vXi16 or more then the existing lowering will be better.
13740 if (!IsLegal) {
13741 if ((RotateAmt % 16) == 0)
13742 return SDValue();
13743 // TODO: Use getTargetVShiftByConstNode.
13744 unsigned ShlAmt = RotateAmt;
13745 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13746 V1 = DAG.getBitcast(RotateVT, V1);
13747 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13748 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13749 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13750 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13751 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13752 return DAG.getBitcast(VT, Rot);
13753 }
13754
13755 SDValue Rot =
13756 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13757 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13758 return DAG.getBitcast(VT, Rot);
13759}
13760
13761/// Try to match a vector shuffle as an element rotation.
13762///
13763 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13764static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13765 ArrayRef<int> Mask) {
13766 int NumElts = Mask.size();
13767
13768 // We need to detect various ways of spelling a rotation:
13769 // [11, 12, 13, 14, 15, 0, 1, 2]
13770 // [-1, 12, 13, 14, -1, -1, 1, -1]
13771 // [-1, -1, -1, -1, -1, -1, 1, 2]
13772 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13773 // [-1, 4, 5, 6, -1, -1, 9, -1]
13774 // [-1, 4, 5, 6, -1, -1, -1, -1]
13775 int Rotation = 0;
13776 SDValue Lo, Hi;
13777 for (int i = 0; i < NumElts; ++i) {
13778 int M = Mask[i];
13779 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13780 "Unexpected mask index.");
13781 if (M < 0)
13782 continue;
13783
13784 // Determine where a rotated vector would have started.
13785 int StartIdx = i - (M % NumElts);
13786 if (StartIdx == 0)
13787 // The identity rotation isn't interesting, stop.
13788 return -1;
13789
13790 // If we found the tail of a vector the rotation must be the missing
13791 // front. If we found the head of a vector, it must be how much of the
13792 // head.
13793 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13794
13795 if (Rotation == 0)
13796 Rotation = CandidateRotation;
13797 else if (Rotation != CandidateRotation)
13798 // The rotations don't match, so we can't match this mask.
13799 return -1;
13800
13801 // Compute which value this mask is pointing at.
13802 SDValue MaskV = M < NumElts ? V1 : V2;
13803
13804 // Compute which of the two target values this index should be assigned
13805 // to. This reflects whether the high elements are remaining or the low
13806 // elements are remaining.
13807 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13808
13809 // Either set up this value if we've not encountered it before, or check
13810 // that it remains consistent.
13811 if (!TargetV)
13812 TargetV = MaskV;
13813 else if (TargetV != MaskV)
13814 // This may be a rotation, but it pulls from the inputs in some
13815 // unsupported interleaving.
13816 return -1;
13817 }
13818
13819 // Check that we successfully analyzed the mask, and normalize the results.
13820 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13821 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13822 if (!Lo)
13823 Lo = Hi;
13824 else if (!Hi)
13825 Hi = Lo;
13826
13827 V1 = Lo;
13828 V2 = Hi;
13829
13830 return Rotation;
13831}
13832
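// Minimal trace (not from the original file) of the rotation detection above on
// the documented example mask, with the two inputs modelled as the index ranges
// 0..7 (V1) and 8..15 (V2) instead of SDValues.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8;
  std::vector<int> Mask = {11, 12, 13, 14, 15, 0, 1, 2};
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return 1; // identity rotation, not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation != 0 && Rotation != Candidate)
      return 1; // inconsistent rotation amounts
    Rotation = Candidate;
  }
  std::printf("Rotation = %d\n", Rotation); // 3
}
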
13833/// Try to lower a vector shuffle as a byte rotation.
13834///
13835/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13836/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13837/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13838 /// try to generically lower a vector shuffle through such a pattern. It
13839/// does not check for the profitability of lowering either as PALIGNR or
13840/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13841/// This matches shuffle vectors that look like:
13842///
13843/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13844///
13845/// Essentially it concatenates V1 and V2, shifts right by some number of
13846/// elements, and takes the low elements as the result. Note that while this is
13847/// specified as a *right shift* because x86 is little-endian, it is a *left
13848/// rotate* of the vector lanes.
13849static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13850 ArrayRef<int> Mask) {
13851 // Don't accept any shuffles with zero elements.
13852 if (isAnyZero(Mask))
13853 return -1;
13854
13855 // PALIGNR works on 128-bit lanes.
13856 SmallVector<int, 16> RepeatedMask;
13857 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13858 return -1;
13859
13860 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13861 if (Rotation <= 0)
13862 return -1;
13863
13864 // PALIGNR rotates bytes, so we need to scale the
13865 // rotation based on how many bytes are in the vector lane.
13866 int NumElts = RepeatedMask.size();
13867 int Scale = 16 / NumElts;
13868 return Rotation * Scale;
13869}
13870
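// Quick numeric example, assuming the v8i16 case from the comment above: an
// element rotation of 3 scales to the 6-byte immediate PALIGNR expects. Not
// part of the original file.
#include <cstdio>

int main() {
  int ElementRotation = 3; // from mask <11,12,13,14,15,0,1,2> on v8i16
  int NumLaneElts = 8;     // elements per repeated 128-bit lane
  int Scale = 16 / NumLaneElts;
  std::printf("PALIGNR imm = %d bytes\n", ElementRotation * Scale); // 6
}
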
13871static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13872 SDValue V2, ArrayRef<int> Mask,
13873 const X86Subtarget &Subtarget,
13874 SelectionDAG &DAG) {
13875 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13876
13877 SDValue Lo = V1, Hi = V2;
13878 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13879 if (ByteRotation <= 0)
13880 return SDValue();
13881
13882 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13883 // PSLLDQ/PSRLDQ.
13884 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13885 Lo = DAG.getBitcast(ByteVT, Lo);
13886 Hi = DAG.getBitcast(ByteVT, Hi);
13887
13888 // SSSE3 targets can use the palignr instruction.
13889 if (Subtarget.hasSSSE3()) {
13890 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13891 "512-bit PALIGNR requires BWI instructions");
13892 return DAG.getBitcast(
13893 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13894 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13895 }
13896
13897 assert(VT.is128BitVector() &&
13898 "Rotate-based lowering only supports 128-bit lowering!");
13899 assert(Mask.size() <= 16 &&
13900 "Can shuffle at most 16 bytes in a 128-bit vector!");
13901 assert(ByteVT == MVT::v16i8 &&
13902 "SSE2 rotate lowering only needed for v16i8!");
13903
13904 // Default SSE2 implementation
13905 int LoByteShift = 16 - ByteRotation;
13906 int HiByteShift = ByteRotation;
13907
13908 SDValue LoShift =
13909 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13910 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13911 SDValue HiShift =
13912 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13913 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13914 return DAG.getBitcast(VT,
13915 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13916}
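// Illustrative sketch, not part of the lowering code above: the identity the
// two code paths rely on, written with SSE intrinsics. Assuming the PALIGNR
// node's operands map to _mm_alignr_epi8 in the same order, the SSSE3 path
// and the SSE2 PSLLDQ/PSRLDQ/POR fallback compute the same bytes. Compile
// with -mssse3; the rotation must be a compile-time constant.
#include <immintrin.h>
#include <cassert>

static void demoPalignrVsShiftOr() {
  __m128i Lo = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  __m128i Hi = _mm_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
                            23, 22, 21, 20, 19, 18, 17, 16);
  // ByteRotation = 6, e.g. the v8i16 example (3 elements * 2 bytes).
  __m128i Palignr = _mm_alignr_epi8(Lo, Hi, 6);               // SSSE3 path
  __m128i ShiftOr = _mm_or_si128(_mm_slli_si128(Lo, 16 - 6),  // SSE2 fallback
                                 _mm_srli_si128(Hi, 6));
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(Palignr, ShiftOr)) == 0xFFFF);
}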
13917
13918/// Try to lower a vector shuffle as a dword/qword rotation.
13919///
13920/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13921/// rotation of the concatenation of two vectors; this routine will
13922/// try to generically lower a vector shuffle through such a pattern.
13923///
13924/// Essentially it concatenates V1 and V2, shifts right by some number of
13925/// elements, and takes the low elements as the result. Note that while this is
13926/// specified as a *right shift* because x86 is little-endian, it is a *left
13927/// rotate* of the vector lanes.
13928static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13929 SDValue V2, ArrayRef<int> Mask,
13930 const X86Subtarget &Subtarget,
13931 SelectionDAG &DAG) {
13932 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13933 "Only 32-bit and 64-bit elements are supported!");
13934
13935 // 128/256-bit vectors are only supported with VLX.
13936 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13937 && "VLX required for 128/256-bit vectors");
13938
13939 SDValue Lo = V1, Hi = V2;
13940 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13941 if (Rotation <= 0)
13942 return SDValue();
13943
13944 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13945 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13946}
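// Illustrative sketch, not part of the lowering code above: a scalar model of
// the VALIGND/VALIGNQ operation this routine emits - concatenate two sources,
// shift right by Rotation elements, keep the low half. Which input supplies
// the low half of the concatenation, and the rotation amount, are what
// matchShuffleAsElementRotate determines; A and B here are illustrative names.
#include <array>
#include <cassert>
#include <cstdint>

template <size_t N>
static std::array<uint32_t, N> valignModel(const std::array<uint32_t, N> &A,
                                           const std::array<uint32_t, N> &B,
                                           unsigned Rotation) {
  std::array<uint32_t, N> Result{};
  for (size_t i = 0; i != N; ++i) {
    size_t Src = i + Rotation; // index into the 2N-element concatenation
    Result[i] = Src < N ? A[Src] : B[Src - N];
  }
  return Result;
}

static void demoValignModel() {
  std::array<uint32_t, 4> A = {0, 1, 2, 3}, B = {4, 5, 6, 7};
  // Rotation 3: the result takes elements 3..6 of the concatenation.
  assert((valignModel(A, B, 3) == std::array<uint32_t, 4>{3, 4, 5, 6}));
}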
13947
13948/// Try to lower a vector shuffle as a byte shift sequence.
13949static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13950 SDValue V2, ArrayRef<int> Mask,
13951 const APInt &Zeroable,
13952 const X86Subtarget &Subtarget,
13953 SelectionDAG &DAG) {
13954 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13955 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13956
13957 // We need a shuffle that has zeros at one/both ends and a sequential
13958 // shuffle from one source within.
13959 unsigned ZeroLo = Zeroable.countr_one();
13960 unsigned ZeroHi = Zeroable.countl_one();
13961 if (!ZeroLo && !ZeroHi)
13962 return SDValue();
13963
13964 unsigned NumElts = Mask.size();
13965 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13966 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13967 return SDValue();
13968
13969 unsigned Scale = VT.getScalarSizeInBits() / 8;
13970 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13971 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13972 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13973 return SDValue();
13974
13975 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13976 Res = DAG.getBitcast(MVT::v16i8, Res);
13977
13978 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13979 // inner sequential set of elements, possibly offset:
13980 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13981 // 01234567 --> 4567zzzz --> zzzzz456
13982 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13983 if (ZeroLo == 0) {
13984 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13985 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13986 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13987 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13988 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13989 } else if (ZeroHi == 0) {
13990 unsigned Shift = Mask[ZeroLo] % NumElts;
13991 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13992 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13993 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13994 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13995 } else if (!Subtarget.hasSSSE3()) {
13996 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13997 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13998 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13999 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14000 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14001 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14002 Shift += Mask[ZeroLo] % NumElts;
14003 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14004 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14005 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14006 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14007 } else
14008 return SDValue();
14009
14010 return DAG.getBitcast(VT, Res);
14011}
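// Illustrative sketch, not part of the lowering code above: the first diagram
// ("01234567 --> zzzzzz01 --> 1zzzzzzz") as intrinsics, for a v8i16 shuffle
// [1, zz, zz, zz, zz, zz, zz, zz]. Here ZeroLo == 0, Len == 1 and Scale == 2,
// so the code shifts left by 6*2 bytes and then right by 7*2 bytes. Compile
// with -msse2.
#include <immintrin.h>
#include <cassert>

static void demoByteShiftMask() {
  __m128i V = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);        // elements {0..7}
  __m128i Res = _mm_srli_si128(_mm_slli_si128(V, 12), 14);
  __m128i Expected = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 1); // {1, zz x 7}
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(Res, Expected)) == 0xFFFF);
}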
14012
14013/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14014///
14015/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14016/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14017/// matches elements from one of the input vectors shuffled to the left or
14018/// right with zeroable elements 'shifted in'. It handles both the strictly
14019/// bit-wise element shifts and the byte shift across an entire 128-bit double
14020/// quad word lane.
14021///
14022/// PSHL : (little-endian) left bit shift.
14023/// [ zz, 0, zz, 2 ]
14024/// [ -1, 4, zz, -1 ]
14025/// PSRL : (little-endian) right bit shift.
14026/// [ 1, zz, 3, zz]
14027/// [ -1, -1, 7, zz]
14028/// PSLLDQ : (little-endian) left byte shift
14029/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14030/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14031/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14032/// PSRLDQ : (little-endian) right byte shift
14033/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14034/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14035/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14036static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14037 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14038 int MaskOffset, const APInt &Zeroable,
14039 const X86Subtarget &Subtarget) {
14040 int Size = Mask.size();
14041 unsigned SizeInBits = Size * ScalarSizeInBits;
14042
14043 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14044 for (int i = 0; i < Size; i += Scale)
14045 for (int j = 0; j < Shift; ++j)
14046 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14047 return false;
14048
14049 return true;
14050 };
14051
14052 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14053 for (int i = 0; i != Size; i += Scale) {
14054 unsigned Pos = Left ? i + Shift : i;
14055 unsigned Low = Left ? i : i + Shift;
14056 unsigned Len = Scale - Shift;
14057 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14058 return -1;
14059 }
14060
14061 int ShiftEltBits = ScalarSizeInBits * Scale;
14062 bool ByteShift = ShiftEltBits > 64;
14063 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14064 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14065 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14066
14067 // Normalize the scale for byte shifts to still produce an i64 element
14068 // type.
14069 Scale = ByteShift ? Scale / 2 : Scale;
14070
14071 // We need to round trip through the appropriate type for the shift.
14072 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14073 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14074 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14075 return (int)ShiftAmt;
14076 };
14077
14078 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14079 // keep doubling the size of the integer elements up to that. We can
14080 // then shift the elements of the integer vector by whole multiples of
14081 // their width within the elements of the larger integer vector. Test each
14082 // multiple to see if we can find a match with the moved element indices
14083 // and that the shifted in elements are all zeroable.
14084 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14085 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14086 for (int Shift = 1; Shift != Scale; ++Shift)
14087 for (bool Left : {true, false})
14088 if (CheckZeros(Shift, Scale, Left)) {
14089 int ShiftAmt = MatchShift(Shift, Scale, Left);
14090 if (0 < ShiftAmt)
14091 return ShiftAmt;
14092 }
14093
14094 // no match
14095 return -1;
14096}
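// Illustrative sketch, not part of the matching code above: the
// "PSHL [ zz, 0, zz, 2 ]" pattern as intrinsics. A v4i32 shuffle with that
// mask is matched with Scale == 2 and Shift == 1, i.e. a 64-bit element left
// shift (VSHLI on v2i64) by 32 bits. Compile with -msse2.
#include <immintrin.h>
#include <cassert>

static void demoShiftMatch() {
  __m128i V = _mm_set_epi32(40, 30, 20, 10);      // elements {10, 20, 30, 40}
  __m128i Shifted = _mm_slli_epi64(V, 32);        // PSLLQ $32
  __m128i Expected = _mm_set_epi32(30, 0, 10, 0); // shuffle mask [zz, 0, zz, 2]
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(Shifted, Expected)) == 0xFFFF);
}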
14097
14098static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14099 SDValue V2, ArrayRef<int> Mask,
14100 const APInt &Zeroable,
14101 const X86Subtarget &Subtarget,
14102 SelectionDAG &DAG) {
14103 int Size = Mask.size();
14104 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14105
14106 MVT ShiftVT;
14107 SDValue V = V1;
14108 unsigned Opcode;
14109
14110 // Try to match shuffle against V1 shift.
14111 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14112 Mask, 0, Zeroable, Subtarget);
14113
14114 // If V1 failed, try to match shuffle against V2 shift.
14115 if (ShiftAmt < 0) {
14116 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14117 Mask, Size, Zeroable, Subtarget);
14118 V = V2;
14119 }
14120
14121 if (ShiftAmt < 0)
14122 return SDValue();
14123
14124 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14125 "Illegal integer vector type");
14126 V = DAG.getBitcast(ShiftVT, V);
14127 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14128 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14129 return DAG.getBitcast(VT, V);
14130}
14131
14132// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14133// Remainder of lower half result is zero and upper half is all undef.
14134static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14135 ArrayRef<int> Mask, uint64_t &BitLen,
14136 uint64_t &BitIdx, const APInt &Zeroable) {
14137 int Size = Mask.size();
14138 int HalfSize = Size / 2;
14139 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14140 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14141
14142 // Upper half must be undefined.
14143 if (!isUndefUpperHalf(Mask))
14144 return false;
14145
14146 // Determine the extraction length from the part of the
14147 // lower half that isn't zeroable.
14148 int Len = HalfSize;
14149 for (; Len > 0; --Len)
14150 if (!Zeroable[Len - 1])
14151 break;
14152 assert(Len > 0 && "Zeroable shuffle mask");
14153
14154 // Attempt to match first Len sequential elements from the lower half.
14155 SDValue Src;
14156 int Idx = -1;
14157 for (int i = 0; i != Len; ++i) {
14158 int M = Mask[i];
14159 if (M == SM_SentinelUndef)
14160 continue;
14161 SDValue &V = (M < Size ? V1 : V2);
14162 M = M % Size;
14163
14164 // The extracted elements must start at a valid index and all mask
14165 // elements must be in the lower half.
14166 if (i > M || M >= HalfSize)
14167 return false;
14168
14169 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14170 Src = V;
14171 Idx = M - i;
14172 continue;
14173 }
14174 return false;
14175 }
14176
14177 if (!Src || Idx < 0)
14178 return false;
14179
14180 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14181 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14182 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14183 V1 = Src;
14184 return true;
14185}
14186
14187// INSERTQ: Extract lowest Len elements from lower half of second source and
14188// insert over first source, starting at Idx.
14189// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14190static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14191 ArrayRef<int> Mask, uint64_t &BitLen,
14192 uint64_t &BitIdx) {
14193 int Size = Mask.size();
14194 int HalfSize = Size / 2;
14195 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14196
14197 // Upper half must be undefined.
14198 if (!isUndefUpperHalf(Mask))
14199 return false;
14200
14201 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14202 SDValue Base;
14203
14204 // Attempt to match first source from mask before insertion point.
14205 if (isUndefInRange(Mask, 0, Idx)) {
14206 /* EMPTY */
14207 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14208 Base = V1;
14209 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14210 Base = V2;
14211 } else {
14212 continue;
14213 }
14214
14215 // Extend the extraction length looking to match both the insertion of
14216 // the second source and the remaining elements of the first.
14217 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14218 SDValue Insert;
14219 int Len = Hi - Idx;
14220
14221 // Match insertion.
14222 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14223 Insert = V1;
14224 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14225 Insert = V2;
14226 } else {
14227 continue;
14228 }
14229
14230 // Match the remaining elements of the lower half.
14231 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14232 /* EMPTY */
14233 } else if ((!Base || (Base == V1)) &&
14234 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14235 Base = V1;
14236 } else if ((!Base || (Base == V2)) &&
14237 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14238 Size + Hi)) {
14239 Base = V2;
14240 } else {
14241 continue;
14242 }
14243
14244 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14245 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14246 V1 = Base;
14247 V2 = Insert;
14248 return true;
14249 }
14250 }
14251
14252 return false;
14253}
14254
14255/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14256static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14257 SDValue V2, ArrayRef<int> Mask,
14258 const APInt &Zeroable, SelectionDAG &DAG) {
14259 uint64_t BitLen, BitIdx;
14260 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14261 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14262 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14263 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14264
14265 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14266 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14267 V2 ? V2 : DAG.getUNDEF(VT),
14268 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14269 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14270
14271 return SDValue();
14272}
14273
14274/// Lower a vector shuffle as a zero or any extension.
14275///
14276/// Given a specific number of elements, element bit width, and extension
14277/// stride, produce either a zero or any extension based on the available
14278/// features of the subtarget. The extended elements are consecutive and
14279/// begin and can start from an offsetted element index in the input; to
14280/// avoid excess shuffling the offset must either being in the bottom lane
14281/// or at the start of a higher lane. All extended elements must be from
14282/// the same lane.
14283static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14284 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14285 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14286 assert(Scale > 1 && "Need a scale to extend.");
14287 int EltBits = VT.getScalarSizeInBits();
14288 int NumElements = VT.getVectorNumElements();
14289 int NumEltsPerLane = 128 / EltBits;
14290 int OffsetLane = Offset / NumEltsPerLane;
14291 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14292 "Only 8, 16, and 32 bit elements can be extended.");
14293 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14294 assert(0 <= Offset && "Extension offset must be positive.");
14295 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14296 "Extension offset must be in the first lane or start an upper lane.");
14297
14298 // Check that an index is in same lane as the base offset.
14299 auto SafeOffset = [&](int Idx) {
14300 return OffsetLane == (Idx / NumEltsPerLane);
14301 };
14302
14303 // Shift along an input so that the offset base moves to the first element.
14304 auto ShuffleOffset = [&](SDValue V) {
14305 if (!Offset)
14306 return V;
14307
14308 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14309 for (int i = 0; i * Scale < NumElements; ++i) {
14310 int SrcIdx = i + Offset;
14311 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14312 }
14313 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14314 };
14315
14316 // Found a valid a/zext mask! Try various lowering strategies based on the
14317 // input type and available ISA extensions.
14318 if (Subtarget.hasSSE41()) {
14319 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14320 // PUNPCK will catch this in a later shuffle match.
14321 if (Offset && Scale == 2 && VT.is128BitVector())
14322 return SDValue();
14323 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14324 NumElements / Scale);
14325 InputV = ShuffleOffset(InputV);
14326 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14327 DL, ExtVT, InputV, DAG);
14328 return DAG.getBitcast(VT, InputV);
14329 }
14330
14331 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14332
14333 // For any extends we can cheat for larger element sizes and use shuffle
14334 // instructions that can fold with a load and/or copy.
14335 if (AnyExt && EltBits == 32) {
14336 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14337 -1};
14338 return DAG.getBitcast(
14339 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14340 DAG.getBitcast(MVT::v4i32, InputV),
14341 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14342 }
14343 if (AnyExt && EltBits == 16 && Scale > 2) {
14344 int PSHUFDMask[4] = {Offset / 2, -1,
14345 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14346 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14347 DAG.getBitcast(MVT::v4i32, InputV),
14348 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14349 int PSHUFWMask[4] = {1, -1, -1, -1};
14350 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14351 return DAG.getBitcast(
14352 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14353 DAG.getBitcast(MVT::v8i16, InputV),
14354 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14355 }
14356
14357 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14358 // to 64-bits.
14359 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14360 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14361 assert(VT.is128BitVector() && "Unexpected vector width!");
14362
14363 int LoIdx = Offset * EltBits;
14364 SDValue Lo = DAG.getBitcast(
14365 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14366 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14367 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14368
14369 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14370 return DAG.getBitcast(VT, Lo);
14371
14372 int HiIdx = (Offset + 1) * EltBits;
14373 SDValue Hi = DAG.getBitcast(
14374 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14375 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14376 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14377 return DAG.getBitcast(VT,
14378 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14379 }
14380
14381 // If this would require more than 2 unpack instructions to expand, use
14382 // pshufb when available. We can only use more than 2 unpack instructions
14383 // when zero extending i8 elements which also makes it easier to use pshufb.
14384 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14385 assert(NumElements == 16 && "Unexpected byte vector width!");
14386 SDValue PSHUFBMask[16];
14387 for (int i = 0; i < 16; ++i) {
14388 int Idx = Offset + (i / Scale);
14389 if ((i % Scale == 0 && SafeOffset(Idx))) {
14390 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14391 continue;
14392 }
14393 PSHUFBMask[i] =
14394 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14395 }
14396 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14397 return DAG.getBitcast(
14398 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14399 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14400 }
14401
14402 // If we are extending from an offset, ensure we start on a boundary that
14403 // we can unpack from.
14404 int AlignToUnpack = Offset % (NumElements / Scale);
14405 if (AlignToUnpack) {
14406 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14407 for (int i = AlignToUnpack; i < NumElements; ++i)
14408 ShMask[i - AlignToUnpack] = i;
14409 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14410 Offset -= AlignToUnpack;
14411 }
14412
14413 // Otherwise emit a sequence of unpacks.
14414 do {
14415 unsigned UnpackLoHi = X86ISD::UNPCKL;
14416 if (Offset >= (NumElements / 2)) {
14417 UnpackLoHi = X86ISD::UNPCKH;
14418 Offset -= (NumElements / 2);
14419 }
14420
14421 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14422 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14423 : getZeroVector(InputVT, Subtarget, DAG, DL);
14424 InputV = DAG.getBitcast(InputVT, InputV);
14425 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14426 Scale /= 2;
14427 EltBits *= 2;
14428 NumElements /= 2;
14429 } while (Scale > 1);
14430 return DAG.getBitcast(VT, InputV);
14431}
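// Illustrative sketch, not part of the lowering code above: zero-extending
// the low eight i8 elements to i16. The SSE4.1 path above emits a
// zero-extend-vector-inreg node (PMOVZXBW for i8 -> i16); without SSE4.1 a
// single UNPCKL step against a zero vector gives the same result, since
// Scale == 2. Compile with -msse4.1.
#include <immintrin.h>
#include <cassert>

static void demoZeroExtendLowering() {
  __m128i V = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  __m128i ZExtSSE41 = _mm_cvtepu8_epi16(V);                       // PMOVZXBW
  __m128i ZExtUnpack = _mm_unpacklo_epi8(V, _mm_setzero_si128()); // PUNPCKLBW
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(ZExtSSE41, ZExtUnpack)) == 0xFFFF);
}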
14432
14433/// Try to lower a vector shuffle as a zero extension on any microarch.
14434///
14435/// This routine will try to do everything in its power to cleverly lower
14436/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14437/// check for the profitability of this lowering, it tries to aggressively
14438/// match this pattern. It will use all of the micro-architectural details it
14439/// can to emit an efficient lowering. It handles both blends with all-zero
14440/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14441/// masking out later).
14442///
14443/// The reason we have dedicated lowering for zext-style shuffles is that they
14444/// are both incredibly common and often quite performance sensitive.
14445static SDValue lowerShuffleAsZeroOrAnyExtend(
14446 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14447 const APInt &Zeroable, const X86Subtarget &Subtarget,
14448 SelectionDAG &DAG) {
14449 int Bits = VT.getSizeInBits();
14450 int NumLanes = Bits / 128;
14451 int NumElements = VT.getVectorNumElements();
14452 int NumEltsPerLane = NumElements / NumLanes;
14453 assert(VT.getScalarSizeInBits() <= 32 &&
14454 "Exceeds 32-bit integer zero extension limit");
14455 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14456
14457 // Define a helper function to check a particular ext-scale and lower to it if
14458 // valid.
14459 auto Lower = [&](int Scale) -> SDValue {
14460 SDValue InputV;
14461 bool AnyExt = true;
14462 int Offset = 0;
14463 int Matches = 0;
14464 for (int i = 0; i < NumElements; ++i) {
14465 int M = Mask[i];
14466 if (M < 0)
14467 continue; // Valid anywhere but doesn't tell us anything.
14468 if (i % Scale != 0) {
14469 // Each of the extended elements need to be zeroable.
14470 if (!Zeroable[i])
14471 return SDValue();
14472
14473 // We no longer are in the anyext case.
14474 AnyExt = false;
14475 continue;
14476 }
14477
14478 // Each of the base elements needs to be consecutive indices into the
14479 // same input vector.
14480 SDValue V = M < NumElements ? V1 : V2;
14481 M = M % NumElements;
14482 if (!InputV) {
14483 InputV = V;
14484 Offset = M - (i / Scale);
14485 } else if (InputV != V)
14486 return SDValue(); // Flip-flopping inputs.
14487
14488 // Offset must start in the lowest 128-bit lane or at the start of an
14489 // upper lane.
14490 // FIXME: Is it ever worth allowing a negative base offset?
14491 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14492 (Offset % NumEltsPerLane) == 0))
14493 return SDValue();
14494
14495 // If we are offsetting, all referenced entries must come from the same
14496 // lane.
14497 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14498 return SDValue();
14499
14500 if ((M % NumElements) != (Offset + (i / Scale)))
14501 return SDValue(); // Non-consecutive strided elements.
14502 Matches++;
14503 }
14504
14505 // If we fail to find an input, we have a zero-shuffle which should always
14506 // have already been handled.
14507 // FIXME: Maybe handle this here in case during blending we end up with one?
14508 if (!InputV)
14509 return SDValue();
14510
14511 // If we are offsetting, don't extend if we only match a single input, we
14512 // can always do better by using a basic PSHUF or PUNPCK.
14513 if (Offset != 0 && Matches < 2)
14514 return SDValue();
14515
14516 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14517 InputV, Mask, Subtarget, DAG);
14518 };
14519
14520 // The widest scale possible for extending is to a 64-bit integer.
14521 assert(Bits % 64 == 0 &&
14522 "The number of bits in a vector must be divisible by 64 on x86!");
14523 int NumExtElements = Bits / 64;
14524
14525 // Each iteration, try extending the elements half as much, but into twice as
14526 // many elements.
14527 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14528 assert(NumElements % NumExtElements == 0 &&
14529 "The input vector size must be divisible by the extended size.");
14530 if (SDValue V = Lower(NumElements / NumExtElements))
14531 return V;
14532 }
14533
14534 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14535 if (Bits != 128)
14536 return SDValue();
14537
14538 // Returns one of the source operands if the shuffle can be reduced to a
14539 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14540 auto CanZExtLowHalf = [&]() {
14541 for (int i = NumElements / 2; i != NumElements; ++i)
14542 if (!Zeroable[i])
14543 return SDValue();
14544 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14545 return V1;
14546 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14547 return V2;
14548 return SDValue();
14549 };
14550
14551 if (SDValue V = CanZExtLowHalf()) {
14552 V = DAG.getBitcast(MVT::v2i64, V);
14553 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14554 return DAG.getBitcast(VT, V);
14555 }
14556
14557 // No viable ext lowering found.
14558 return SDValue();
14559}
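// Illustrative sketch, not part of the lowering code above: the
// CanZExtLowHalf case for a 128-bit vector. Keeping the low 64 bits and
// zeroing the upper 64 bits is exactly MOVQ (VZEXT_MOVL on v2i64). Compile
// with -msse2.
#include <immintrin.h>
#include <cassert>

static void demoMovqZeroExtendLowHalf() {
  __m128i V = _mm_set_epi32(4, 3, 2, 1);        // elements {1, 2, 3, 4}
  __m128i Res = _mm_move_epi64(V);              // MOVQ xmm, xmm
  __m128i Expected = _mm_set_epi32(0, 0, 2, 1); // shuffle mask [0, 1, zz, zz]
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(Res, Expected)) == 0xFFFF);
}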
14560
14561/// Try to get a scalar value for a specific element of a vector.
14562///
14563/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14564static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14565 SelectionDAG &DAG) {
14566 MVT VT = V.getSimpleValueType();
14567 MVT EltVT = VT.getVectorElementType();
14568 V = peekThroughBitcasts(V);
14569
14570 // If the bitcasts shift the element size, we can't extract an equivalent
14571 // element from it.
14572 MVT NewVT = V.getSimpleValueType();
14573 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14574 return SDValue();
14575
14576 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14577 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14578 // Ensure the scalar operand is the same size as the destination.
14579 // FIXME: Add support for scalar truncation where possible.
14580 SDValue S = V.getOperand(Idx);
14581 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14582 return DAG.getBitcast(EltVT, S);
14583 }
14584
14585 return SDValue();
14586}
14587
14588/// Helper to test for a load that can be folded with x86 shuffles.
14589///
14590/// This is particularly important because the set of instructions varies
14591/// significantly based on whether the operand is a load or not.
14592static bool isShuffleFoldableLoad(SDValue V) {
14593 return V->hasOneUse() &&
14594 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14595}
14596
14597template<typename T>
14598static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14599 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14600}
14601
14602template<typename T>
14603bool X86TargetLowering::isSoftFP16(T VT) const {
14604 return ::isSoftFP16(VT, Subtarget);
14605}
14606
14607/// Try to lower insertion of a single element into a zero vector.
14608///
14609/// This is a common pattern that we have especially efficient patterns to lower
14610/// across all subtarget feature sets.
14611static SDValue lowerShuffleAsElementInsertion(
14612 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14613 const APInt &Zeroable, const X86Subtarget &Subtarget,
14614 SelectionDAG &DAG) {
14615 MVT ExtVT = VT;
14616 MVT EltVT = VT.getVectorElementType();
14617
14618 if (isSoftFP16(EltVT, Subtarget))
14619 return SDValue();
14620
14621 int V2Index =
14622 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14623 Mask.begin();
14624 bool IsV1Zeroable = true;
14625 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14626 if (i != V2Index && !Zeroable[i]) {
14627 IsV1Zeroable = false;
14628 break;
14629 }
14630
14631 // Check for a single input from a SCALAR_TO_VECTOR node.
14632 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14633 // all the smarts here sunk into that routine. However, the current
14634 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14635 // vector shuffle lowering is dead.
14636 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14637 DAG);
14638 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14639 // We need to zext the scalar if it is smaller than an i32.
14640 V2S = DAG.getBitcast(EltVT, V2S);
14641 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14642 // Using zext to expand a narrow element won't work for non-zero
14643 // insertions.
14644 if (!IsV1Zeroable)
14645 return SDValue();
14646
14647 // Zero-extend directly to i32.
14648 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14649 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14650 }
14651 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14652 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14653 EltVT == MVT::i16) {
14654 // Either not inserting from the low element of the input or the input
14655 // element size is too small to use VZEXT_MOVL to clear the high bits.
14656 return SDValue();
14657 }
14658
14659 if (!IsV1Zeroable) {
14660 // If V1 can't be treated as a zero vector we have fewer options to lower
14661 // this. We can't support integer vectors or non-zero targets cheaply, and
14662 // the V1 elements can't be permuted in any way.
14663 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14664 if (!VT.isFloatingPoint() || V2Index != 0)
14665 return SDValue();
14666 SmallVector<int, 8> V1Mask(Mask);
14667 V1Mask[V2Index] = -1;
14668 if (!isNoopShuffleMask(V1Mask))
14669 return SDValue();
14670 if (!VT.is128BitVector())
14671 return SDValue();
14672
14673 // Otherwise, use MOVSD, MOVSS or MOVSH.
14674 unsigned MovOpc = 0;
14675 if (EltVT == MVT::f16)
14676 MovOpc = X86ISD::MOVSH;
14677 else if (EltVT == MVT::f32)
14678 MovOpc = X86ISD::MOVSS;
14679 else if (EltVT == MVT::f64)
14680 MovOpc = X86ISD::MOVSD;
14681 else
14682 llvm_unreachable("Unsupported floating point element type to handle!");
14683 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14684 }
14685
14686 // This lowering only works for the low element with floating point vectors.
14687 if (VT.isFloatingPoint() && V2Index != 0)
14688 return SDValue();
14689
14690 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14691 if (ExtVT != VT)
14692 V2 = DAG.getBitcast(VT, V2);
14693
14694 if (V2Index != 0) {
14695 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14696 // the desired position. Otherwise it is more efficient to do a vector
14697 // shift left. We know that we can do a vector shift left because all
14698 // the inputs are zero.
14699 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14700 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14701 V2Shuffle[V2Index] = 0;
14702 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14703 } else {
14704 V2 = DAG.getBitcast(MVT::v16i8, V2);
14705 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14706 DAG.getTargetConstant(
14707 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14708 V2 = DAG.getBitcast(VT, V2);
14709 }
14710 }
14711 return V2;
14712}
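// Illustrative sketch, not part of the lowering code above: the non-zeroable
// V1 floating-point case, where inserting V2's low element while keeping the
// rest of V1 in place is a single MOVSS (MOVSD/MOVSH for f64/f16). This
// corresponds to a shuffle mask like [4, 1, 2, 3]. Compile with -msse.
#include <immintrin.h>
#include <cassert>

static void demoElementInsertionMovss() {
  __m128 V1 = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // {1, 2, 3, 4}
  __m128 V2 = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); // {5, 6, 7, 8}
  __m128 Res = _mm_move_ss(V1, V2);               // {V2[0], V1[1], V1[2], V1[3]}
  __m128 Expected = _mm_set_ps(4.0f, 3.0f, 2.0f, 5.0f);
  assert(_mm_movemask_ps(_mm_cmpeq_ps(Res, Expected)) == 0xF);
}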
14713
14714/// Try to lower broadcast of a single - truncated - integer element,
14715/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14716///
14717/// This assumes we have AVX2.
14718static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14719 int BroadcastIdx,
14720 const X86Subtarget &Subtarget,
14721 SelectionDAG &DAG) {
14722 assert(Subtarget.hasAVX2() &&
14723 "We can only lower integer broadcasts with AVX2!");
14724
14725 MVT EltVT = VT.getVectorElementType();
14726 MVT V0VT = V0.getSimpleValueType();
14727
14728 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14729 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14730
14731 MVT V0EltVT = V0VT.getVectorElementType();
14732 if (!V0EltVT.isInteger())
14733 return SDValue();
14734
14735 const unsigned EltSize = EltVT.getSizeInBits();
14736 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14737
14738 // This is only a truncation if the original element type is larger.
14739 if (V0EltSize <= EltSize)
14740 return SDValue();
14741
14742 assert(((V0EltSize % EltSize) == 0) &&
14743 "Scalar type sizes must all be powers of 2 on x86!");
14744
14745 const unsigned V0Opc = V0.getOpcode();
14746 const unsigned Scale = V0EltSize / EltSize;
14747 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14748
14749 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14750 V0Opc != ISD::BUILD_VECTOR)
14751 return SDValue();
14752
14753 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14754
14755 // If we're extracting non-least-significant bits, shift so we can truncate.
14756 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14757 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14758 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14759 if (const int OffsetIdx = BroadcastIdx % Scale)
14760 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14761 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14762
14763 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14764 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14765}
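// Illustrative sketch, not part of the lowering code above: the scalar
// arithmetic behind the SRL-before-truncate step. Broadcasting element
// BroadcastIdx of the narrow type from a wider scalar is a broadcast of the
// wide value shifted right by OffsetIdx * EltSize bits and then truncated.
#include <cassert>
#include <cstdint>

static void demoTruncBroadcastScalar() {
  uint32_t Wide = 0xAABBCCDDu;  // a 32-bit build_vector element
  unsigned EltSize = 8, OffsetIdx = 2;
  uint8_t Broadcasted = (uint8_t)(Wide >> (OffsetIdx * EltSize));
  assert(Broadcasted == 0xBB);  // byte 2 (little-endian) of 0xAABBCCDD
}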
14766
14767/// Test whether this can be lowered with a single SHUFPS instruction.
14768///
14769/// This is used to disable more specialized lowerings when the shufps lowering
14770/// will happen to be efficient.
14771static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14772 // This routine only handles 128-bit shufps.
14773 assert(Mask.size() == 4 && "Unsupported mask size!");
14774 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14775 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14776 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14777 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14778
14779 // To lower with a single SHUFPS we need to have the low half and high half
14780 // each requiring a single input.
14781 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14782 return false;
14783 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14784 return false;
14785
14786 return true;
14787}
14788
14789/// Test whether the specified input (0 or 1) is in-place blended by the
14790/// given mask.
14791///
14792/// This returns true if the elements from a particular input are already in the
14793/// slot required by the given mask and require no permutation.
14794static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14795 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14796 int Size = Mask.size();
14797 for (int i = 0; i < Size; ++i)
14798 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14799 return false;
14800
14801 return true;
14802}
14803
14804/// If we are extracting two 128-bit halves of a vector and shuffling the
14805/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14806/// multi-shuffle lowering.
14807static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14808 SDValue N1, ArrayRef<int> Mask,
14809 SelectionDAG &DAG) {
14810 MVT VT = N0.getSimpleValueType();
14811 assert((VT.is128BitVector() &&
14812 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14813 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14814
14815 // Check that both sources are extracts of the same source vector.
14816 if (!N0.hasOneUse() || !N1.hasOneUse() ||
14817 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14818 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14819 N0.getOperand(0) != N1.getOperand(0))
14820 return SDValue();
14821
14822 SDValue WideVec = N0.getOperand(0);
14823 MVT WideVT = WideVec.getSimpleValueType();
14824 if (!WideVT.is256BitVector())
14825 return SDValue();
14826
14827 // Match extracts of each half of the wide source vector. Commute the shuffle
14828 // if the extract of the low half is N1.
14829 unsigned NumElts = VT.getVectorNumElements();
14830 SmallVector<int, 4> NewMask(Mask);
14831 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14832 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14833 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14834 ShuffleVectorSDNode::commuteMask(NewMask);
14835 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14836 return SDValue();
14837
14838 // Final bailout: if the mask is simple, we are better off using an extract
14839 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14840 // because that avoids a constant load from memory.
14841 if (NumElts == 4 &&
14842 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14843 return SDValue();
14844
14845 // Extend the shuffle mask with undef elements.
14846 NewMask.append(NumElts, -1);
14847
14848 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14849 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14850 NewMask);
14851 // This is free: ymm -> xmm.
14852 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14853 DAG.getIntPtrConstant(0, DL));
14854}
14855
14856/// Try to lower broadcast of a single element.
14857///
14858/// For convenience, this code also bundles all of the subtarget feature set
14859/// filtering. While a little annoying to re-dispatch on type here, there isn't
14860/// a convenient way to factor it out.
14861static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14862 SDValue V2, ArrayRef<int> Mask,
14863 const X86Subtarget &Subtarget,
14864 SelectionDAG &DAG) {
14865 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14866 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14867 (Subtarget.hasAVX2() && VT.isInteger())))
14868 return SDValue();
14869
14870 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14871 // we can only broadcast from a register with AVX2.
14872 unsigned NumEltBits = VT.getScalarSizeInBits();
14873 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14874 ? X86ISD::MOVDDUP
14875 : X86ISD::VBROADCAST;
14876 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14877
14878 // Check that the mask is a broadcast.
14879 int BroadcastIdx = getSplatIndex(Mask);
14880 if (BroadcastIdx < 0)
14881 return SDValue();
14882 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14883 "a sorted mask where the broadcast "
14884 "comes from V1.");
14885
14886 // Go up the chain of (vector) values to find a scalar load that we can
14887 // combine with the broadcast.
14888 // TODO: Combine this logic with findEltLoadSrc() used by
14889 // EltsFromConsecutiveLoads().
14890 int BitOffset = BroadcastIdx * NumEltBits;
14891 SDValue V = V1;
14892 for (;;) {
14893 switch (V.getOpcode()) {
14894 case ISD::BITCAST: {
14895 V = V.getOperand(0);
14896 continue;
14897 }
14898 case ISD::CONCAT_VECTORS: {
14899 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14900 int OpIdx = BitOffset / OpBitWidth;
14901 V = V.getOperand(OpIdx);
14902 BitOffset %= OpBitWidth;
14903 continue;
14904 }
14905 case ISD::EXTRACT_SUBVECTOR: {
14906 // The extraction index adds to the existing offset.
14907 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14908 unsigned Idx = V.getConstantOperandVal(1);
14909 unsigned BeginOffset = Idx * EltBitWidth;
14910 BitOffset += BeginOffset;
14911 V = V.getOperand(0);
14912 continue;
14913 }
14914 case ISD::INSERT_SUBVECTOR: {
14915 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14916 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14917 int Idx = (int)V.getConstantOperandVal(2);
14918 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14919 int BeginOffset = Idx * EltBitWidth;
14920 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14921 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14922 BitOffset -= BeginOffset;
14923 V = VInner;
14924 } else {
14925 V = VOuter;
14926 }
14927 continue;
14928 }
14929 }
14930 break;
14931 }
14932 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14933 BroadcastIdx = BitOffset / NumEltBits;
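// Illustrative annotation (not part of the original X86ISelLowering.cpp): with
// NumEltBits == 32 and BroadcastIdx == 5, BitOffset starts at 160. If V1 is a
// bitcast of concat_vectors(A, B) with 128-bit halves, the CONCAT_VECTORS case
// picks OpIdx = 160 / 128 = 1, so V becomes B and BitOffset becomes
// 160 % 128 = 32, leaving a final BroadcastIdx of 32 / 32 = 1 within B.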
14934
14935 // Do we need to bitcast the source to retrieve the original broadcast index?
14936 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14937
14938 // Check if this is a broadcast of a scalar. We special case lowering
14939 // for scalars so that we can more effectively fold with loads.
14940 // If the original value has a larger element type than the shuffle, the
14941 // broadcast element is in essence truncated. Make that explicit to ease
14942 // folding.
14943 if (BitCastSrc && VT.isInteger())
14944 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14945 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14946 return TruncBroadcast;
14947
14948 // Also check the simpler case, where we can directly reuse the scalar.
14949 if (!BitCastSrc &&
14950 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14951 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14952 V = V.getOperand(BroadcastIdx);
14953
14954 // If we can't broadcast from a register, check that the input is a load.
14955 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14956 return SDValue();
14957 } else if (ISD::isNormalLoad(V.getNode()) &&
14958 cast<LoadSDNode>(V)->isSimple()) {
14959 // We do not check for one-use of the vector load because a broadcast load
14960 // is expected to be a win for code size, register pressure, and possibly
14961 // uops even if the original vector load is not eliminated.
14962
14963 // Reduce the vector load and shuffle to a broadcasted scalar load.
14964 LoadSDNode *Ld = cast<LoadSDNode>(V);
14965 SDValue BaseAddr = Ld->getOperand(1);
14966 MVT SVT = VT.getScalarType();
14967 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14968 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14969 SDValue NewAddr =
14970 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14971
14972 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14973 // than MOVDDUP.
14974 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14975 if (Opcode == X86ISD::VBROADCAST) {
14976 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14977 SDValue Ops[] = {Ld->getChain(), NewAddr};
14978 V = DAG.getMemIntrinsicNode(
14979 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14980 DAG.getMachineFunction().getMachineMemOperand(
14981 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14982 DAG.makeEquivalentMemoryOrdering(Ld, V);
14983 return DAG.getBitcast(VT, V);
14984 }
14985 assert(SVT == MVT::f64 && "Unexpected VT!");
14986 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14987 DAG.getMachineFunction().getMachineMemOperand(
14988 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14989 DAG.makeEquivalentMemoryOrdering(Ld, V);
14990 } else if (!BroadcastFromReg) {
14991 // We can't broadcast from a vector register.
14992 return SDValue();
14993 } else if (BitOffset != 0) {
14994 // We can only broadcast from the zero-element of a vector register,
14995 // but it can be advantageous to broadcast from the zero-element of a
14996 // subvector.
14997 if (!VT.is256BitVector() && !VT.is512BitVector())
14998 return SDValue();
14999
15000 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15001 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15002 return SDValue();
15003
15004 // Only broadcast the zero-element of a 128-bit subvector.
15005 if ((BitOffset % 128) != 0)
15006 return SDValue();
15007
15008 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15009 "Unexpected bit-offset");
15010 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15011 "Unexpected vector size");
15012 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15013 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15014 }
15015
15016 // On AVX we can use VBROADCAST directly for scalar sources.
15017 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15018 V = DAG.getBitcast(MVT::f64, V);
15019 if (Subtarget.hasAVX()) {
15020 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15021 return DAG.getBitcast(VT, V);
15022 }
15023 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15024 }
15025
15026 // If this is a scalar, do the broadcast on this type and bitcast.
15027 if (!V.getValueType().isVector()) {
15028 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15029 "Unexpected scalar size");
15030 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15031 VT.getVectorNumElements());
15032 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15033 }
15034
15035 // We only support broadcasting from 128-bit vectors to minimize the
15036 // number of patterns we need to deal with in isel. So extract down to
15037 // 128-bits, removing as many bitcasts as possible.
15038 if (V.getValueSizeInBits() > 128)
15039 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15040
15041 // Otherwise cast V to a vector with the same element type as VT, but
15042 // possibly narrower than VT. Then perform the broadcast.
15043 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15044 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15045 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15046}
15047
15048// Check whether we can use INSERTPS to perform the shuffle. We only use
15049// INSERTPS when the V1 elements are already in the correct locations
15050// because otherwise we can just always use two SHUFPS instructions which
15051// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15052// perform INSERTPS if a single V1 element is out of place and all V2
15053// elements are zeroable.
15054static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15055 unsigned &InsertPSMask,
15056 const APInt &Zeroable,
15057 ArrayRef<int> Mask, SelectionDAG &DAG) {
15058 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15059 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15060 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15061
15062 // Attempt to match INSERTPS with one element from VA or VB being
15063 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15064 // are updated.
15065 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15066 ArrayRef<int> CandidateMask) {
15067 unsigned ZMask = 0;
15068 int VADstIndex = -1;
15069 int VBDstIndex = -1;
15070 bool VAUsedInPlace = false;
15071
15072 for (int i = 0; i < 4; ++i) {
15073 // Synthesize a zero mask from the zeroable elements (includes undefs).
15074 if (Zeroable[i]) {
15075 ZMask |= 1 << i;
15076 continue;
15077 }
15078
15079 // Flag if we use any VA inputs in place.
15080 if (i == CandidateMask[i]) {
15081 VAUsedInPlace = true;
15082 continue;
15083 }
15084
15085 // We can only insert a single non-zeroable element.
15086 if (VADstIndex >= 0 || VBDstIndex >= 0)
15087 return false;
15088
15089 if (CandidateMask[i] < 4) {
15090 // VA input out of place for insertion.
15091 VADstIndex = i;
15092 } else {
15093 // VB input for insertion.
15094 VBDstIndex = i;
15095 }
15096 }
15097
15098 // Don't bother if we have no (non-zeroable) element for insertion.
15099 if (VADstIndex < 0 && VBDstIndex < 0)
15100 return false;
15101
15102 // Determine element insertion src/dst indices. The src index is from the
15103 // start of the inserted vector, not the start of the concatenated vector.
15104 unsigned VBSrcIndex = 0;
15105 if (VADstIndex >= 0) {
15106 // If we have a VA input out of place, we use VA as the V2 element
15107 // insertion and don't use the original V2 at all.
15108 VBSrcIndex = CandidateMask[VADstIndex];
15109 VBDstIndex = VADstIndex;
15110 VB = VA;
15111 } else {
15112 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15113 }
15114
15115 // If no V1 inputs are used in place, then the result is created only from
15116 // the zero mask and the V2 insertion - so remove V1 dependency.
15117 if (!VAUsedInPlace)
15118 VA = DAG.getUNDEF(MVT::v4f32);
15119
15120 // Update V1, V2 and InsertPSMask accordingly.
15121 V1 = VA;
15122 V2 = VB;
15123
15124 // Insert the V2 element into the desired position.
15125 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15126 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15127 return true;
15128 };
15129
15130 if (matchAsInsertPS(V1, V2, Mask))
15131 return true;
15132
15133 // Commute and try again.
15134 SmallVector<int, 4> CommutedMask(Mask);
15135 ShuffleVectorSDNode::commuteMask(CommutedMask);
15136 if (matchAsInsertPS(V2, V1, CommutedMask))
15137 return true;
15138
15139 return false;
15140}
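// Illustrative annotation (not part of the original X86ISelLowering.cpp): the
// immediate built in the lambda above follows the INSERTPS encoding, with the
// source element index in bits [7:6], the destination lane in bits [5:4] and
// the zero mask in bits [3:0]. For example, inserting element 2 of V2 into
// lane 1 of V1 while zeroing lane 3 gives
//   (2 << 6) | (1 << 4) | 0b1000 == 0x98.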
15141
15142static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15143 ArrayRef<int> Mask, const APInt &Zeroable,
15144 SelectionDAG &DAG) {
15145 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15146 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15147
15148 // Attempt to match the insertps pattern.
15149 unsigned InsertPSMask = 0;
15150 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15151 return SDValue();
15152
15153 // Insert the V2 element into the desired position.
15154 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15155 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15156}
15157
15158/// Handle lowering of 2-lane 64-bit floating point shuffles.
15159///
15160/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15161/// support for floating point shuffles but not integer shuffles. These
15162/// instructions will incur a domain crossing penalty on some chips though so
15163/// it is better to avoid lowering through this for integer vectors where
15164/// possible.
15165static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15166 const APInt &Zeroable, SDValue V1, SDValue V2,
15167 const X86Subtarget &Subtarget,
15168 SelectionDAG &DAG) {
15169 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15170 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15171 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15172
15173 if (V2.isUndef()) {
15174 // Check for being able to broadcast a single element.
15175 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15176 Mask, Subtarget, DAG))
15177 return Broadcast;
15178
15179 // Straight shuffle of a single input vector. Simulate this by using the
15180 // single input as both of the "inputs" to this instruction.
15181 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15182
15183 if (Subtarget.hasAVX()) {
15184 // If we have AVX, we can use VPERMILPS which will allow folding a load
15185 // into the shuffle.
15186 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15187 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15188 }
15189
15190 return DAG.getNode(
15191 X86ISD::SHUFP, DL, MVT::v2f64,
15192 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15193 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15194 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15195 }
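// Illustrative annotation (not part of the original X86ISelLowering.cpp):
// SHUFPD/VPERMILPD take one immediate bit per destination lane, so in the
// single-input case above Mask = <1, 0> yields SHUFPDMask = 1 | (0 << 1) = 0b01
// (lane 0 reads element 1, lane 1 reads element 0), while Mask = <1, 1> yields
// 0b11, splatting the high element.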
15196 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15197 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15198 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15199 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15200
15201 if (Subtarget.hasAVX2())
15202 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15203 return Extract;
15204
15205 // When loading a scalar and then shuffling it into a vector we can often do
15206 // the insertion cheaply.
15207 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15208 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15209 return Insertion;
15210 // Try inverting the insertion since for v2 masks it is easy to do and we
15211 // can't reliably sort the mask one way or the other.
15212 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15213 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15214 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15215 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15216 return Insertion;
15217
15218 // Try to use one of the special instruction patterns to handle two common
15219 // blend patterns if a zero-blend above didn't work.
15220 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15221 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15222 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15223 // We can either use a special instruction to load over the low double or
15224 // to move just the low double.
15225 return DAG.getNode(
15226 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15227 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15228
15229 if (Subtarget.hasSSE41())
15230 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15231 Zeroable, Subtarget, DAG))
15232 return Blend;
15233
15234 // Use dedicated unpack instructions for masks that match their pattern.
15235 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15236 return V;
15237
15238 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15239 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15240 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15241}
15242
15243/// Handle lowering of 2-lane 64-bit integer shuffles.
15244///
15245/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15246/// the integer unit to minimize domain crossing penalties. However, for blends
15247/// it falls back to the floating point shuffle operation with appropriate bit
15248/// casting.
15249static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15250 const APInt &Zeroable, SDValue V1, SDValue V2,
15251 const X86Subtarget &Subtarget,
15252 SelectionDAG &DAG) {
15253 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15254 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15255 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15256
15257 if (V2.isUndef()) {
15258 // Check for being able to broadcast a single element.
15259 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15260 Mask, Subtarget, DAG))
15261 return Broadcast;
15262
15263 // Straight shuffle of a single input vector. For everything from SSE2
15264 // onward this has a single fast instruction with no scary immediates.
15265 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15266 V1 = DAG.getBitcast(MVT::v4i32, V1);
15267 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15268 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15269 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15270 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15271 return DAG.getBitcast(
15272 MVT::v2i64,
15273 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15274 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15275 }
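// Illustrative annotation (not part of the original X86ISelLowering.cpp): a
// v2i64 mask of <1, 0> widens to the v4i32 mask <2, 3, 0, 1>, which is encoded
// two bits per lane as 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E, the familiar
// "swap the two quadwords" PSHUFD immediate.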
15276 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15277 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15278 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15279 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15280
15281 if (Subtarget.hasAVX2())
15282 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15283 return Extract;
15284
15285 // Try to use shift instructions.
15286 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
15287 Zeroable, Subtarget, DAG))
15288 return Shift;
15289
15290 // When loading a scalar and then shuffling it into a vector we can often do
15291 // the insertion cheaply.
15292 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15293 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15294 return Insertion;
15295 // Try inverting the insertion since for v2 masks it is easy to do and we
15296 // can't reliably sort the mask one way or the other.
15297 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15298 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15299 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15300 return Insertion;
15301
15302 // We have different paths for blend lowering, but they all must use the
15303 // *exact* same predicate.
15304 bool IsBlendSupported = Subtarget.hasSSE41();
15305 if (IsBlendSupported)
15306 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15307 Zeroable, Subtarget, DAG))
15308 return Blend;
15309
15310 // Use dedicated unpack instructions for masks that match their pattern.
15311 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15312 return V;
15313
15314 // Try to use byte rotation instructions.
15315 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15316 if (Subtarget.hasSSSE3()) {
15317 if (Subtarget.hasVLX())
15318 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15319 Subtarget, DAG))
15320 return Rotate;
15321
15322 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15323 Subtarget, DAG))
15324 return Rotate;
15325 }
15326
15327 // If we have direct support for blends, we should lower by decomposing into
15328 // a permute. That will be faster than the domain cross.
15329 if (IsBlendSupported)
15330 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15331 Subtarget, DAG);
15332
15333 // We implement this with SHUFPD which is pretty lame because it will likely
15334 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15335 // However, all the alternatives are still more cycles and newer chips don't
15336 // have this problem. It would be really nice if x86 had better shuffles here.
15337 V1 = DAG.getBitcast(MVT::v2f64, V1);
15338 V2 = DAG.getBitcast(MVT::v2f64, V2);
15339 return DAG.getBitcast(MVT::v2i64,
15340 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15341}
15342
15343/// Lower a vector shuffle using the SHUFPS instruction.
15344///
15345/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15346/// It makes no assumptions about whether this is the *best* lowering, it simply
15347/// uses it.
15348static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15349 ArrayRef<int> Mask, SDValue V1,
15350 SDValue V2, SelectionDAG &DAG) {
15351 SDValue LowV = V1, HighV = V2;
15352 SmallVector<int, 4> NewMask(Mask);
15353 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15354
15355 if (NumV2Elements == 1) {
15356 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15357
15358 // Compute the index adjacent to V2Index and in the same half by toggling
15359 // the low bit.
15360 int V2AdjIndex = V2Index ^ 1;
15361
15362 if (Mask[V2AdjIndex] < 0) {
15363 // Handles all the cases where we have a single V2 element and an undef.
15364 // This will only ever happen in the high lanes because we commute the
15365 // vector otherwise.
15366 if (V2Index < 2)
15367 std::swap(LowV, HighV);
15368 NewMask[V2Index] -= 4;
15369 } else {
15370 // Handle the case where the V2 element ends up adjacent to a V1 element.
15371 // To make this work, blend them together as the first step.
15372 int V1Index = V2AdjIndex;
15373 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15374 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15375 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15376
15377 // Now proceed to reconstruct the final blend as we have the necessary
15378 // high or low half formed.
15379 if (V2Index < 2) {
15380 LowV = V2;
15381 HighV = V1;
15382 } else {
15383 HighV = V2;
15384 }
15385 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15386 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15387 }
15388 } else if (NumV2Elements == 2) {
15389 if (Mask[0] < 4 && Mask[1] < 4) {
15390 // Handle the easy case where we have V1 in the low lanes and V2 in the
15391 // high lanes.
15392 NewMask[2] -= 4;
15393 NewMask[3] -= 4;
15394 } else if (Mask[2] < 4 && Mask[3] < 4) {
15395 // We also handle the reversed case because this utility may get called
15396 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15397 // arrange things in the right direction.
15398 NewMask[0] -= 4;
15399 NewMask[1] -= 4;
15400 HighV = V1;
15401 LowV = V2;
15402 } else {
15403 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15404 // trying to place elements directly, just blend them and set up the final
15405 // shuffle to place them.
15406
15407 // The first two blend mask elements are for V1, the second two are for
15408 // V2.
15409 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15410 Mask[2] < 4 ? Mask[2] : Mask[3],
15411 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15412 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15413 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15414 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15415
15416 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15417 // a blend.
15418 LowV = HighV = V1;
15419 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15420 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15421 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15422 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15423 }
15424 } else if (NumV2Elements == 3) {
15425 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15426 // we can get here due to other paths (e.g repeated mask matching) that we
15427 // don't want to do another round of lowerVECTOR_SHUFFLE.
15428 ShuffleVectorSDNode::commuteMask(NewMask);
15429 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15430 }
15431 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15432 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15433}
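// Illustrative annotation (not part of the original X86ISelLowering.cpp),
// tracing the NumV2Elements == 1 path for Mask = <0, 4, 2, 3>: V2Index = 1 and
// the adjacent V1 element is Mask[0] = 0, so the first SHUFPS blends (V2, V1)
// with <0, 0, 0, 0> into <V2[0], V2[0], V1[0], V1[0]>; the final SHUFPS then
// applies NewMask = <2, 0, 2, 3> to (blend, V1), producing
// <V1[0], V2[0], V1[2], V1[3]>, exactly the requested shuffle.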
15434
15435/// Lower 4-lane 32-bit floating point shuffles.
15436///
15437/// Uses instructions exclusively from the floating point unit to minimize
15438/// domain crossing penalties, as these are sufficient to implement all v4f32
15439/// shuffles.
15440static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15441 const APInt &Zeroable, SDValue V1, SDValue V2,
15442 const X86Subtarget &Subtarget,
15443 SelectionDAG &DAG) {
15444 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15445 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15446 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15447
15448 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15449
15450 if (NumV2Elements == 0) {
15451 // Check for being able to broadcast a single element.
15452 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15453 Mask, Subtarget, DAG))
15454 return Broadcast;
15455
15456 // Use even/odd duplicate instructions for masks that match their pattern.
15457 if (Subtarget.hasSSE3()) {
15458 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15459 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15460 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15461 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15462 }
15463
15464 if (Subtarget.hasAVX()) {
15465 // If we have AVX, we can use VPERMILPS which will allow folding a load
15466 // into the shuffle.
15467 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15468 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15469 }
15470
15471 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15472 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15473 if (!Subtarget.hasSSE2()) {
15474 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15475 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15476 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15477 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15478 }
15479
15480 // Otherwise, use a straight shuffle of a single input vector. We pass the
15481 // input vector to both operands to simulate this with a SHUFPS.
15482 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15483 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15484 }
15485
15486 if (Subtarget.hasAVX2())
15487 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15488 return Extract;
15489
15490 // There are special ways we can lower some single-element blends. However, we
15491 // have custom ways we can lower more complex single-element blends below that
15492 // we defer to if both this and BLENDPS fail to match, so restrict this to
15493 // when the V2 input is targeting element 0 of the mask -- that is the fast
15494 // case here.
15495 if (NumV2Elements == 1 && Mask[0] >= 4)
15496 if (SDValue V = lowerShuffleAsElementInsertion(
15497 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15498 return V;
15499
15500 if (Subtarget.hasSSE41()) {
15501 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15502 Zeroable, Subtarget, DAG))
15503 return Blend;
15504
15505 // Use INSERTPS if we can complete the shuffle efficiently.
15506 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15507 return V;
15508
15509 if (!isSingleSHUFPSMask(Mask))
15510 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15511 V2, Mask, DAG))
15512 return BlendPerm;
15513 }
15514
15515 // Use low/high mov instructions. These are only valid in SSE1 because
15516 // otherwise they are widened to v2f64 and never get here.
15517 if (!Subtarget.hasSSE2()) {
15518 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15519 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15520 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15521 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15522 }
15523
15524 // Use dedicated unpack instructions for masks that match their pattern.
15525 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15526 return V;
15527
15528 // Otherwise fall back to a SHUFPS lowering strategy.
15529 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15530}
15531
15532/// Lower 4-lane i32 vector shuffles.
15533///
15534/// We try to handle these with integer-domain shuffles where we can, but for
15535/// blends we use the floating point domain blend instructions.
15536static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15537 const APInt &Zeroable, SDValue V1, SDValue V2,
15538 const X86Subtarget &Subtarget,
15539 SelectionDAG &DAG) {
15540 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15541 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15542 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15543
15544 // Whenever we can lower this as a zext, that instruction is strictly faster
15545 // than any alternative. It also allows us to fold memory operands into the
15546 // shuffle in many cases.
15547 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15548 Zeroable, Subtarget, DAG))
15549 return ZExt;
15550
15551 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15552
15553 if (NumV2Elements == 0) {
15554 // Try to use broadcast unless the mask only has one non-undef element.
15555 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15556 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15557 Mask, Subtarget, DAG))
15558 return Broadcast;
15559 }
15560
15561 // Straight shuffle of a single input vector. For everything from SSE2
15562 // onward this has a single fast instruction with no scary immediates.
15563 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15564 // but we aren't actually going to use the UNPCK instruction because doing
15565 // so prevents folding a load into this instruction or making a copy.
15566 const int UnpackLoMask[] = {0, 0, 1, 1};
15567 const int UnpackHiMask[] = {2, 2, 3, 3};
15568 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15569 Mask = UnpackLoMask;
15570 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15571 Mask = UnpackHiMask;
15572
15573 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15574 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15575 }
15576
15577 if (Subtarget.hasAVX2())
15578 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15579 return Extract;
15580
15581 // Try to use shift instructions.
15582 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
15583 Zeroable, Subtarget, DAG))
15584 return Shift;
15585
15586 // There are special ways we can lower some single-element blends.
15587 if (NumV2Elements == 1)
15588 if (SDValue V = lowerShuffleAsElementInsertion(
15589 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15590 return V;
15591
15592 // We have different paths for blend lowering, but they all must use the
15593 // *exact* same predicate.
15594 bool IsBlendSupported = Subtarget.hasSSE41();
15595 if (IsBlendSupported)
15596 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15597 Zeroable, Subtarget, DAG))
15598 return Blend;
15599
15600 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15601 Zeroable, Subtarget, DAG))
15602 return Masked;
15603
15604 // Use dedicated unpack instructions for masks that match their pattern.
15605 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15606 return V;
15607
15608 // Try to use byte rotation instructions.
15609 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15610 if (Subtarget.hasSSSE3()) {
15611 if (Subtarget.hasVLX())
15612 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15613 Subtarget, DAG))
15614 return Rotate;
15615
15616 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15617 Subtarget, DAG))
15618 return Rotate;
15619 }
15620
15621 // Assume that a single SHUFPS is faster than an alternative sequence of
15622 // multiple instructions (even if the CPU has a domain penalty).
15623 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15624 if (!isSingleSHUFPSMask(Mask)) {
15625 // If we have direct support for blends, we should lower by decomposing into
15626 // a permute. That will be faster than the domain cross.
15627 if (IsBlendSupported)
15628 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15629 Subtarget, DAG);
15630
15631 // Try to lower by permuting the inputs into an unpack instruction.
15632 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15633 Mask, Subtarget, DAG))
15634 return Unpack;
15635 }
15636
15637 // We implement this with SHUFPS because it can blend from two vectors.
15638 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15639 // up the inputs, bypassing domain shift penalties that we would incur if we
15640 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15641 // relevant.
15642 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15643 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15644 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15645 return DAG.getBitcast(MVT::v4i32, ShufPS);
15646}
15647
15648/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15649/// shuffle lowering, and the most complex part.
15650///
15651/// The lowering strategy is to try to form pairs of input lanes which are
15652/// targeted at the same half of the final vector, and then use a dword shuffle
15653/// to place them onto the right half, and finally unpack the paired lanes into
15654/// their final position.
15655///
15656/// The exact breakdown of how to form these dword pairs and align them on the
15657/// correct sides is really tricky. See the comments within the function for
15658/// more of the details.
15659///
15660/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15661/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15662/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15663/// vector, form the analogous 128-bit 8-element Mask.
15664static SDValue lowerV8I16GeneralSingleInputShuffle(
15665 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15666 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15667 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15668 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15669
15670 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15671 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15672 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15673
15674 // Attempt to directly match PSHUFLW or PSHUFHW.
15675 if (isUndefOrInRange(LoMask, 0, 4) &&
15676 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15677 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15678 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15679 }
15680 if (isUndefOrInRange(HiMask, 4, 8) &&
15681 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15682 for (int i = 0; i != 4; ++i)
15683 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15684 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15685 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15686 }
15687
15688 SmallVector<int, 4> LoInputs;
15689 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15690 array_pod_sort(LoInputs.begin(), LoInputs.end());
15691 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15692 SmallVector<int, 4> HiInputs;
15693 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15694 array_pod_sort(HiInputs.begin(), HiInputs.end());
15695 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15696 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15697 int NumHToL = LoInputs.size() - NumLToL;
15698 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15699 int NumHToH = HiInputs.size() - NumLToH;
15700 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15701 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15702 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15703 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15704
15705 // If we are shuffling values from one half - check how many different DWORD
15706 // pairs we need to create. If only 1 or 2 then we can perform this as a
15707 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15708 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15709 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15710 V = DAG.getNode(ShufWOp, DL, VT, V,
15711 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15712 V = DAG.getBitcast(PSHUFDVT, V);
15713 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15714 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15715 return DAG.getBitcast(VT, V);
15716 };
15717
15718 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15719 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15720 SmallVector<std::pair<int, int>, 4> DWordPairs;
15721 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15722
15723 // Collect the different DWORD pairs.
15724 for (int DWord = 0; DWord != 4; ++DWord) {
15725 int M0 = Mask[2 * DWord + 0];
15726 int M1 = Mask[2 * DWord + 1];
15727 M0 = (M0 >= 0 ? M0 % 4 : M0);
15728 M1 = (M1 >= 0 ? M1 % 4 : M1);
15729 if (M0 < 0 && M1 < 0)
15730 continue;
15731
15732 bool Match = false;
15733 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15734 auto &DWordPair = DWordPairs[j];
15735 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15736 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15737 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15738 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15739 PSHUFDMask[DWord] = DOffset + j;
15740 Match = true;
15741 break;
15742 }
15743 }
15744 if (!Match) {
15745 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15746 DWordPairs.push_back(std::make_pair(M0, M1));
15747 }
15748 }
15749
15750 if (DWordPairs.size() <= 2) {
15751 DWordPairs.resize(2, std::make_pair(-1, -1));
15752 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15753 DWordPairs[1].first, DWordPairs[1].second};
15754 if ((NumHToL + NumHToH) == 0)
15755 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15756 if ((NumLToL + NumLToH) == 0)
15757 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15758 }
15759 }
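// Illustrative annotation (not part of the original X86ISelLowering.cpp): in
// the one-half case above, a mask such as <0, 1, 0, 1, 2, 3, 2, 3> collects
// only two distinct word pairs, (0, 1) and (2, 3), so it is emitted as an
// identity PSHUFLW followed by a PSHUFD with mask <0, 0, 1, 1> rather than the
// longer PSHUFD+PSHUFLW+PSHUFHW chain handled below.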
15760
15761 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15762 // such inputs we can swap two of the dwords across the half mark and end up
15763 // with <=2 inputs to each half in each half. Once there, we can fall through
15764 // to the generic code below. For example:
15765 //
15766 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15767 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15768 //
15769 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15770 // and an existing 2-into-2 on the other half. In this case we may have to
15771 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15772 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15773 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15774 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15775 // half than the one we target for fixing) will be fixed when we re-enter this
15776 // path. We will also combine any resulting sequence of PSHUFD instructions
15777 // into a single instruction. Here is an example of the tricky case:
15778 //
15779 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15780 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15781 //
15782 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15783 //
15784 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15785 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15786 //
15787 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15788 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15789 //
15790 // The result is fine to be handled by the generic logic.
15791 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15792 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15793 int AOffset, int BOffset) {
15794 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15795 "Must call this with A having 3 or 1 inputs from the A half.");
15796 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15797 "Must call this with B having 1 or 3 inputs from the B half.");
15798 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15799 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15800
15801 bool ThreeAInputs = AToAInputs.size() == 3;
15802
15803 // Compute the index of dword with only one word among the three inputs in
15804 // a half by taking the sum of the half with three inputs and subtracting
15805 // the sum of the actual three inputs. The difference is the remaining
15806 // slot.
15807 int ADWord = 0, BDWord = 0;
15808 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15809 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15810 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15811 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15812 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15813 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15814 int TripleNonInputIdx =
15815 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15816 TripleDWord = TripleNonInputIdx / 2;
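// Illustrative annotation (not part of the original X86ISelLowering.cpp): with
// AOffset == 0 and AToAInputs == {0, 1, 3}, TripleInputSum is 0+1+2+3 = 6 and
// the three inputs sum to 4, so TripleNonInputIdx = 2 and TripleDWord = 1,
// i.e. dword 1 is the one holding only a single word of the three inputs.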
15817
15818 // We use xor with one to compute the adjacent DWord to whichever one the
15819 // OneInput is in.
15820 OneInputDWord = (OneInput / 2) ^ 1;
15821
15822 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15823 // and BToA inputs. If there is also such a problem with the BToB and AToB
15824 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15825 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15826 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15827 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15828 // Compute how many inputs will be flipped by swapping these DWords. We
15829 // need
15830 // to balance this to ensure we don't form a 3-1 shuffle in the other
15831 // half.
15832 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15833 llvm::count(AToBInputs, 2 * ADWord + 1);
15834 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15835 llvm::count(BToBInputs, 2 * BDWord + 1);
15836 if ((NumFlippedAToBInputs == 1 &&
15837 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15838 (NumFlippedBToBInputs == 1 &&
15839 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15840 // We choose whether to fix the A half or B half based on whether that
15841 // half has zero flipped inputs. At zero, we may not be able to fix it
15842 // with that half. We also bias towards fixing the B half because that
15843 // will more commonly be the high half, and we have to bias one way.
15844 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15845 ArrayRef<int> Inputs) {
15846 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15847 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15848 // Determine whether the free index is in the flipped dword or the
15849 // unflipped dword based on where the pinned index is. We use this bit
15850 // in an xor to conditionally select the adjacent dword.
15851 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15852 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15853 if (IsFixIdxInput == IsFixFreeIdxInput)
15854 FixFreeIdx += 1;
15855 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15856 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15857 "We need to be changing the number of flipped inputs!");
15858 int PSHUFHalfMask[] = {0, 1, 2, 3};
15859 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15860 V = DAG.getNode(
15861 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15862 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15863 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15864
15865 for (int &M : Mask)
15866 if (M >= 0 && M == FixIdx)
15867 M = FixFreeIdx;
15868 else if (M >= 0 && M == FixFreeIdx)
15869 M = FixIdx;
15870 };
15871 if (NumFlippedBToBInputs != 0) {
15872 int BPinnedIdx =
15873 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15874 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15875 } else {
15876 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15877 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15878 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15879 }
15880 }
15881 }
15882
15883 int PSHUFDMask[] = {0, 1, 2, 3};
15884 PSHUFDMask[ADWord] = BDWord;
15885 PSHUFDMask[BDWord] = ADWord;
15886 V = DAG.getBitcast(
15887 VT,
15888 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15889 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15890
15891 // Adjust the mask to match the new locations of A and B.
15892 for (int &M : Mask)
15893 if (M >= 0 && M/2 == ADWord)
15894 M = 2 * BDWord + M % 2;
15895 else if (M >= 0 && M/2 == BDWord)
15896 M = 2 * ADWord + M % 2;
15897
15898 // Recurse back into this routine to re-compute state now that this isn't
15899 // a 3 and 1 problem.
15900 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15901 };
15902 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15903 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15904 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15905 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15906
15907 // At this point there are at most two inputs to the low and high halves from
15908 // each half. That means the inputs can always be grouped into dwords and
15909 // those dwords can then be moved to the correct half with a dword shuffle.
15910 // We use at most one low and one high word shuffle to collect these paired
15911 // inputs into dwords, and finally a dword shuffle to place them.
15912 int PSHUFLMask[4] = {-1, -1, -1, -1};
15913 int PSHUFHMask[4] = {-1, -1, -1, -1};
15914 int PSHUFDMask[4] = {-1, -1, -1, -1};
15915
15916 // First fix the masks for all the inputs that are staying in their
15917 // original halves. This will then dictate the targets of the cross-half
15918 // shuffles.
15919 auto fixInPlaceInputs =
15920 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15921 MutableArrayRef<int> SourceHalfMask,
15922 MutableArrayRef<int> HalfMask, int HalfOffset) {
15923 if (InPlaceInputs.empty())
15924 return;
15925 if (InPlaceInputs.size() == 1) {
15926 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15927 InPlaceInputs[0] - HalfOffset;
15928 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15929 return;
15930 }
15931 if (IncomingInputs.empty()) {
15932 // Just fix all of the in place inputs.
15933 for (int Input : InPlaceInputs) {
15934 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15935 PSHUFDMask[Input / 2] = Input / 2;
15936 }
15937 return;
15938 }
15939
15940 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15941 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15942 InPlaceInputs[0] - HalfOffset;
15943 // Put the second input next to the first so that they are packed into
15944 // a dword. We find the adjacent index by toggling the low bit.
15945 int AdjIndex = InPlaceInputs[0] ^ 1;
15946 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15947 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15948 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15949 };
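// Illustrative example (not from the original source): calling the lambda
// above with a non-empty IncomingInputs, InPlaceInputs == {4, 6} and
// HalfOffset == 4 gives AdjIndex == 4 ^ 1 == 5, so SourceHalfMask becomes
// {0, 2, -1, -1}, every 6 in HalfMask is rewritten to 5, and PSHUFDMask[2]
// is pinned to 2.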
15950 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15951 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15952
15953 // Now gather the cross-half inputs and place them into a free dword of
15954 // their target half.
15955 // FIXME: This operation could almost certainly be simplified dramatically to
15956 // look more like the 3-1 fixing operation.
15957 auto moveInputsToRightHalf = [&PSHUFDMask](
15958 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15959 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15960 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15961 int DestOffset) {
15962 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15963 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15964 };
15965 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15966 int Word) {
15967 int LowWord = Word & ~1;
15968 int HighWord = Word | 1;
15969 return isWordClobbered(SourceHalfMask, LowWord) ||
15970 isWordClobbered(SourceHalfMask, HighWord);
15971 };
15972
15973 if (IncomingInputs.empty())
15974 return;
15975
15976 if (ExistingInputs.empty()) {
15977 // Map any dwords with inputs from them into the right half.
15978 for (int Input : IncomingInputs) {
15979 // If the source half mask maps over the inputs, turn those into
15980 // swaps and use the swapped lane.
15981 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15982 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15983 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15984 Input - SourceOffset;
15985 // We have to swap the uses in our half mask in one sweep.
15986 for (int &M : HalfMask)
15987 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15988 M = Input;
15989 else if (M == Input)
15990 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15991 } else {
15992 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15993 Input - SourceOffset &&
15994 "Previous placement doesn't match!");
15995 }
15996 // Note that this correctly re-maps both when we do a swap and when
15997 // we observe the other side of the swap above. We rely on that to
15998 // avoid swapping the members of the input list directly.
15999 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16000 }
16001
16002 // Map the input's dword into the correct half.
16003 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16004 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16005 else
16006 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16007 Input / 2 &&
16008 "Previous placement doesn't match!");
16009 }
16010
16011 // And just directly shift any other-half mask elements to be same-half
16012 // as we will have mirrored the dword containing the element into the
16013 // same position within that half.
16014 for (int &M : HalfMask)
16015 if (M >= SourceOffset && M < SourceOffset + 4) {
16016 M = M - SourceOffset + DestOffset;
16017 assert(M >= 0 && "This should never wrap below zero!");
16018 }
16019 return;
16020 }
16021
16022 // Ensure we have the input in a viable dword of its current half. This
16023 // is particularly tricky because the original position may be clobbered
16024 // by inputs being moved and *staying* in that half.
16025 if (IncomingInputs.size() == 1) {
16026 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16027 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16028 SourceOffset;
16029 SourceHalfMask[InputFixed - SourceOffset] =
16030 IncomingInputs[0] - SourceOffset;
16031 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16032 InputFixed);
16033 IncomingInputs[0] = InputFixed;
16034 }
16035 } else if (IncomingInputs.size() == 2) {
16036 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16037 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16038 // We have two non-adjacent or clobbered inputs we need to extract from
16039 // the source half. To do this, we need to map them into some adjacent
16040 // dword slot in the source mask.
16041 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16042 IncomingInputs[1] - SourceOffset};
16043
16044 // If there is a free slot in the source half mask adjacent to one of
16045 // the inputs, place the other input in it. We use (Index XOR 1) to
16046 // compute an adjacent index.
16047 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16048 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16049 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16050 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16051 InputsFixed[1] = InputsFixed[0] ^ 1;
16052 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16053 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16054 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16055 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16056 InputsFixed[0] = InputsFixed[1] ^ 1;
16057 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16058 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16059 // The two inputs are in the same DWord but it is clobbered and the
16060 // adjacent DWord isn't used at all. Move both inputs to the free
16061 // slot.
16062 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16063 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16064 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16065 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16066 } else {
16067 // The only way we hit this point is if there is no clobbering
16068 // (because there are no off-half inputs to this half) and there is no
16069 // free slot adjacent to one of the inputs. In this case, we have to
16070 // swap an input with a non-input.
16071 for (int i = 0; i < 4; ++i)
16072 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16073 "We can't handle any clobbers here!");
16074 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16075 "Cannot have adjacent inputs here!");
16076
16077 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16078 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16079
16080 // We also have to update the final source mask in this case because
16081 // it may need to undo the above swap.
16082 for (int &M : FinalSourceHalfMask)
16083 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16084 M = InputsFixed[1] + SourceOffset;
16085 else if (M == InputsFixed[1] + SourceOffset)
16086 M = (InputsFixed[0] ^ 1) + SourceOffset;
16087
16088 InputsFixed[1] = InputsFixed[0] ^ 1;
16089 }
16090
16091 // Point everything at the fixed inputs.
16092 for (int &M : HalfMask)
16093 if (M == IncomingInputs[0])
16094 M = InputsFixed[0] + SourceOffset;
16095 else if (M == IncomingInputs[1])
16096 M = InputsFixed[1] + SourceOffset;
16097
16098 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16099 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16100 }
16101 } else {
16102 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16102)
;
16103 }
16104
16105 // Now hoist the DWord down to the right half.
16106 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16107 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16108 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16109 for (int &M : HalfMask)
16110 for (int Input : IncomingInputs)
16111 if (M == Input)
16112 M = FreeDWord * 2 + Input % 2;
16113 };
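// Illustrative example (not from the original source): with DestOffset == 0
// and PSHUFDMask[0] already taken, FreeDWord == 1; an incoming input at word
// index 5 then sets PSHUFDMask[1] = 5 / 2 == 2 and every 5 in HalfMask is
// rewritten to 1 * 2 + 1 == 3.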
16114 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16115 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16116 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16117 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16118
16119 // Now enact all the shuffles we've computed to move the inputs into their
16120 // target half.
16121 if (!isNoopShuffleMask(PSHUFLMask))
16122 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16123 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16124 if (!isNoopShuffleMask(PSHUFHMask))
16125 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16126 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16127 if (!isNoopShuffleMask(PSHUFDMask))
16128 V = DAG.getBitcast(
16129 VT,
16130 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16131 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16132
16133 // At this point, each half should contain all its inputs, and we can then
16134 // just shuffle them into their final position.
16135 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16136 "Failed to lift all the high half inputs to the low mask!");
16137 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16138 "Failed to lift all the low half inputs to the high mask!");
16139
16140 // Do a half shuffle for the low mask.
16141 if (!isNoopShuffleMask(LoMask))
16142 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16143 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16144
16145 // Do a half shuffle with the high mask after shifting its values down.
16146 for (int &M : HiMask)
16147 if (M >= 0)
16148 M -= 4;
16149 if (!isNoopShuffleMask(HiMask))
16150 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16151 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16152
16153 return V;
16154}
16155
16156/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16157/// blend if only one input is used.
16158static SDValue lowerShuffleAsBlendOfPSHUFBs(
16159 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16160 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16161 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16162 "Lane crossing shuffle masks not supported");
16163
16164 int NumBytes = VT.getSizeInBits() / 8;
16165 int Size = Mask.size();
16166 int Scale = NumBytes / Size;
16167
16168 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16169 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16170 V1InUse = false;
16171 V2InUse = false;
16172
16173 for (int i = 0; i < NumBytes; ++i) {
16174 int M = Mask[i / Scale];
16175 if (M < 0)
16176 continue;
16177
16178 const int ZeroMask = 0x80;
16179 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16180 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16181 if (Zeroable[i / Scale])
16182 V1Idx = V2Idx = ZeroMask;
16183
16184 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16185 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16186 V1InUse |= (ZeroMask != V1Idx);
16187 V2InUse |= (ZeroMask != V2Idx);
16188 }
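// For example (illustrative), for MVT::v8i16: NumBytes == 16, Size == 8 and
// Scale == 2, so result byte i reads Mask[i / 2]; a mask element M < 8
// selects bytes 2*M and 2*M+1 of V1, M >= 8 selects the matching bytes of
// V2, and the 0x80 control byte makes PSHUFB write a zero for that lane.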
16189
16190 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16191 if (V1InUse)
16192 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16193 DAG.getBuildVector(ShufVT, DL, V1Mask));
16194 if (V2InUse)
16195 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16196 DAG.getBuildVector(ShufVT, DL, V2Mask));
16197
16198 // If we need shuffled inputs from both, blend the two.
16199 SDValue V;
16200 if (V1InUse && V2InUse)
16201 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16202 else
16203 V = V1InUse ? V1 : V2;
16204
16205 // Cast the result back to the correct type.
16206 return DAG.getBitcast(VT, V);
16207}
16208
16209/// Generic lowering of 8-lane i16 shuffles.
16210///
16211/// This handles both single-input shuffles and combined shuffle/blends with
16212/// two inputs. The single input shuffles are immediately delegated to
16213/// a dedicated lowering routine.
16214///
16215/// The blends are lowered in one of three fundamental ways. If there are few
16216/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16217/// of the input is significantly cheaper when lowered as an interleaving of
16218/// the two inputs, try to interleave them. Otherwise, blend the low and high
16219/// halves of the inputs separately (making them have relatively few inputs)
16220/// and then concatenate them.
16221static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16222 const APInt &Zeroable, SDValue V1, SDValue V2,
16223 const X86Subtarget &Subtarget,
16224 SelectionDAG &DAG) {
16225 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16226 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16227 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16228
16229 // Whenever we can lower this as a zext, that instruction is strictly faster
16230 // than any alternative.
16231 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16232 Zeroable, Subtarget, DAG))
16233 return ZExt;
16234
16235 // Try to lower using a truncation.
16236 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16237 Subtarget, DAG))
16238 return V;
16239
16240 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16241
16242 if (NumV2Inputs == 0) {
16243 // Try to use shift instructions.
16244 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
16245 Zeroable, Subtarget, DAG))
16246 return Shift;
16247
16248 // Check for being able to broadcast a single element.
16249 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16250 Mask, Subtarget, DAG))
16251 return Broadcast;
16252
16253 // Try to use bit rotation instructions.
16254 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16255 Subtarget, DAG))
16256 return Rotate;
16257
16258 // Use dedicated unpack instructions for masks that match their pattern.
16259 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16260 return V;
16261
16262 // Use dedicated pack instructions for masks that match their pattern.
16263 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16264 Subtarget))
16265 return V;
16266
16267 // Try to use byte rotation instructions.
16268 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16269 Subtarget, DAG))
16270 return Rotate;
16271
16272 // Make a copy of the mask so it can be modified.
16273 SmallVector<int, 8> MutableMask(Mask);
16274 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16275 Subtarget, DAG);
16276 }
16277
16278 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16279 "All single-input shuffles should be canonicalized to be V1-input "
16280 "shuffles.");
16281
16282 // Try to use shift instructions.
16283 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
16284 Zeroable, Subtarget, DAG))
16285 return Shift;
16286
16287 // See if we can use SSE4A Extraction / Insertion.
16288 if (Subtarget.hasSSE4A())
16289 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16290 Zeroable, DAG))
16291 return V;
16292
16293 // There are special ways we can lower some single-element blends.
16294 if (NumV2Inputs == 1)
16295 if (SDValue V = lowerShuffleAsElementInsertion(
16296 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16297 return V;
16298
16299 // We have different paths for blend lowering, but they all must use the
16300 // *exact* same predicate.
16301 bool IsBlendSupported = Subtarget.hasSSE41();
16302 if (IsBlendSupported)
16303 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16304 Zeroable, Subtarget, DAG))
16305 return Blend;
16306
16307 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16308 Zeroable, Subtarget, DAG))
16309 return Masked;
16310
16311 // Use dedicated unpack instructions for masks that match their pattern.
16312 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16313 return V;
16314
16315 // Use dedicated pack instructions for masks that match their pattern.
16316 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16317 Subtarget))
16318 return V;
16319
16320 // Try to lower using a truncation.
16321 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16322 Subtarget, DAG))
16323 return V;
16324
16325 // Try to use byte rotation instructions.
16326 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16327 Subtarget, DAG))
16328 return Rotate;
16329
16330 if (SDValue BitBlend =
16331 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16332 return BitBlend;
16333
16334 // Try to use byte shift instructions to mask.
16335 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16336 Zeroable, Subtarget, DAG))
16337 return V;
16338
16339 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
16340 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16341 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
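// For example (illustrative), Mask == <0,2,4,6,8,10,12,14> is a single even
// compaction (NumEvenDrops == 1): the AND below clears the upper 16 bits of
// every dword and PACKUSDW then yields the even words of V1 followed by the
// even words of V2.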
16342 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16343 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16344 !Subtarget.hasVLX()) {
16345 // Check if this is part of a 256-bit vector truncation.
16346 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16347 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16348 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16349 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16350 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16351 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16352 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16353 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16354 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16355 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16356 } else {
16357 SmallVector<SDValue, 4> DWordClearOps(4,
16358 DAG.getConstant(0, DL, MVT::i32));
16359 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16360 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16361 SDValue DWordClearMask =
16362 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16363 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16364 DWordClearMask);
16365 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16366 DWordClearMask);
16367 }
16368 // Now pack things back together.
16369 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16370 if (NumEvenDrops == 2) {
16371 Result = DAG.getBitcast(MVT::v4i32, Result);
16372 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16373 }
16374 return Result;
16375 }
16376
16377 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
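// (The arithmetic shift sign-extends each upper word so PACKSS's signed
// saturation is a no-op; with SSE41 a logical shift plus PACKUS is used
// instead.)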
16378 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16379 if (NumOddDrops == 1) {
16380 bool HasSSE41 = Subtarget.hasSSE41();
16381 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16382 DAG.getBitcast(MVT::v4i32, V1),
16383 DAG.getTargetConstant(16, DL, MVT::i8));
16384 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16385 DAG.getBitcast(MVT::v4i32, V2),
16386 DAG.getTargetConstant(16, DL, MVT::i8));
16387 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16388 MVT::v8i16, V1, V2);
16389 }
16390
16391 // Try to lower by permuting the inputs into an unpack instruction.
16392 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16393 Mask, Subtarget, DAG))
16394 return Unpack;
16395
16396 // If we can't directly blend but can use PSHUFB, that will be better as it
16397 // can both shuffle and set up the inefficient blend.
16398 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16399 bool V1InUse, V2InUse;
16400 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16401 Zeroable, DAG, V1InUse, V2InUse);
16402 }
16403
16404 // We can always bit-blend if we have to so the fallback strategy is to
16405 // decompose into single-input permutes and blends/unpacks.
16406 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16407 Mask, Subtarget, DAG);
16408}
16409
16410/// Lower 8-lane 16-bit floating point shuffles.
16411static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16412 const APInt &Zeroable, SDValue V1, SDValue V2,
16413 const X86Subtarget &Subtarget,
16414 SelectionDAG &DAG) {
16415 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16416 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16417 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16418 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16419
16420 if (Subtarget.hasFP16()) {
16421 if (NumV2Elements == 0) {
16422 // Check for being able to broadcast a single element.
16423 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16424 Mask, Subtarget, DAG))
16425 return Broadcast;
16426 }
16427 if (NumV2Elements == 1 && Mask[0] >= 8)
16428 if (SDValue V = lowerShuffleAsElementInsertion(
16429 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16430 return V;
16431 }
16432
16433 V1 = DAG.getBitcast(MVT::v8i16, V1);
16434 V2 = DAG.getBitcast(MVT::v8i16, V2);
16435 return DAG.getBitcast(MVT::v8f16,
16436 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16437}
16438
16439// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
16440// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
16441// the active subvector is extracted.
16442static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16443 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16444 const X86Subtarget &Subtarget,
16445 SelectionDAG &DAG) {
16446 MVT MaskVT = VT.changeTypeToInteger();
16447 SDValue MaskNode;
16448 MVT ShuffleVT = VT;
16449 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16450 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16451 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16452 ShuffleVT = V1.getSimpleValueType();
16453
16454 // Adjust mask to correct indices for the second input.
16455 int NumElts = VT.getVectorNumElements();
16456 unsigned Scale = 512 / VT.getSizeInBits();
16457 SmallVector<int, 32> AdjustedMask(Mask);
16458 for (int &M : AdjustedMask)
16459 if (NumElts <= M)
16460 M += (Scale - 1) * NumElts;
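// E.g. (illustrative) for MVT::v16i8 widened to v64i8, Scale == 4: an
// original index of 20 (element 4 of V2) becomes 20 + 3 * 16 == 68, which
// addresses element 4 of the widened V2 in VPERMV3's two-operand index
// space.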
16461 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16462 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16463 } else {
16464 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16465 }
16466
16467 SDValue Result;
16468 if (V2.isUndef())
16469 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16470 else
16471 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16472
16473 if (VT != ShuffleVT)
16474 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16475
16476 return Result;
16477}
16478
16479/// Generic lowering of v16i8 shuffles.
16480///
16481/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16482/// detect any complexity reducing interleaving. If that doesn't help, it uses
16483/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16484/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16485/// back together.
16486static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16487 const APInt &Zeroable, SDValue V1, SDValue V2,
16488 const X86Subtarget &Subtarget,
16489 SelectionDAG &DAG) {
16490 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16491 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16492 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16493
16494 // Try to use shift instructions.
16495 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
16496 Zeroable, Subtarget, DAG))
16497 return Shift;
16498
16499 // Try to use byte rotation instructions.
16500 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16501 Subtarget, DAG))
16502 return Rotate;
16503
16504 // Use dedicated pack instructions for masks that match their pattern.
16505 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16506 Subtarget))
16507 return V;
16508
16509 // Try to use a zext lowering.
16510 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16511 Zeroable, Subtarget, DAG))
16512 return ZExt;
16513
16514 // Try to lower using a truncation.
16515 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16516 Subtarget, DAG))
16517 return V;
16518
16519 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16520 Subtarget, DAG))
16521 return V;
16522
16523 // See if we can use SSE4A Extraction / Insertion.
16524 if (Subtarget.hasSSE4A())
16525 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16526 Zeroable, DAG))
16527 return V;
16528
16529 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16530
16531 // For single-input shuffles, there are some nicer lowering tricks we can use.
16532 if (NumV2Elements == 0) {
16533 // Check for being able to broadcast a single element.
16534 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16535 Mask, Subtarget, DAG))
16536 return Broadcast;
16537
16538 // Try to use bit rotation instructions.
16539 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16540 Subtarget, DAG))
16541 return Rotate;
16542
16543 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16544 return V;
16545
16546 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16547 // Notably, this handles splat and partial-splat shuffles more efficiently.
16548 // However, it only makes sense if the pre-duplication shuffle simplifies
16549 // things significantly. Currently, this means we need to be able to
16550 // express the pre-duplication shuffle as an i16 shuffle.
16551 //
16552 // FIXME: We should check for other patterns which can be widened into an
16553 // i16 shuffle as well.
16554 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16555 for (int i = 0; i < 16; i += 2)
16556 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16557 return false;
16558
16559 return true;
16560 };
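// E.g. (illustrative) a byte-splat mask such as <5,5,5,...> passes this test
// (every output word duplicates a single source byte), while <0,1,2,3,...>
// fails on the very first pair.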
16561 auto tryToWidenViaDuplication = [&]() -> SDValue {
16562 if (!canWidenViaDuplication(Mask))
16563 return SDValue();
16564 SmallVector<int, 4> LoInputs;
16565 copy_if(Mask, std::back_inserter(LoInputs),
16566 [](int M) { return M >= 0 && M < 8; });
16567 array_pod_sort(LoInputs.begin(), LoInputs.end());
16568 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16569 LoInputs.end());
16570 SmallVector<int, 4> HiInputs;
16571 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16572 array_pod_sort(HiInputs.begin(), HiInputs.end());
16573 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16574 HiInputs.end());
16575
16576 bool TargetLo = LoInputs.size() >= HiInputs.size();
16577 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16578 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16579
16580 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16581 SmallDenseMap<int, int, 8> LaneMap;
16582 for (int I : InPlaceInputs) {
16583 PreDupI16Shuffle[I/2] = I/2;
16584 LaneMap[I] = I;
16585 }
16586 int j = TargetLo ? 0 : 4, je = j + 4;
16587 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16588 // Check if j is already a shuffle of this input. This happens when
16589 // there are two adjacent bytes after we move the low one.
16590 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16591 // If we haven't yet mapped the input, search for a slot into which
16592 // we can map it.
16593 while (j < je && PreDupI16Shuffle[j] >= 0)
16594 ++j;
16595
16596 if (j == je)
16597 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16598 return SDValue();
16599
16600 // Map this input with the i16 shuffle.
16601 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16602 }
16603
16604 // Update the lane map based on the mapping we ended up with.
16605 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16606 }
16607 V1 = DAG.getBitcast(
16608 MVT::v16i8,
16609 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16610 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16611
16612 // Unpack the bytes to form the i16s that will be shuffled into place.
16613 bool EvenInUse = false, OddInUse = false;
16614 for (int i = 0; i < 16; i += 2) {
16615 EvenInUse |= (Mask[i + 0] >= 0);
16616 OddInUse |= (Mask[i + 1] >= 0);
16617 if (EvenInUse && OddInUse)
16618 break;
16619 }
16620 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16621 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16622 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
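// E.g. (illustrative) when both operands are V1, the unpack duplicates each
// byte of the chosen half of V1 into both bytes of a 16-bit word, so the i16
// shuffle built below can position the duplicated bytes.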
16623
16624 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16625 for (int i = 0; i < 16; ++i)
16626 if (Mask[i] >= 0) {
16627 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16628 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16629 if (PostDupI16Shuffle[i / 2] < 0)
16630 PostDupI16Shuffle[i / 2] = MappedMask;
16631 else
16632 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16633 "Conflicting entries in the original shuffle!");
16634 }
16635 return DAG.getBitcast(
16636 MVT::v16i8,
16637 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16638 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16639 };
16640 if (SDValue V = tryToWidenViaDuplication())
16641 return V;
16642 }
16643
16644 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16645 Zeroable, Subtarget, DAG))
16646 return Masked;
16647
16648 // Use dedicated unpack instructions for masks that match their pattern.
16649 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16650 return V;
16651
16652 // Try to use byte shift instructions to mask.
16653 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16654 Zeroable, Subtarget, DAG))
16655 return V;
16656
16657 // Check for compaction patterns.
16658 bool IsSingleInput = V2.isUndef();
16659 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16660
16661 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16662 // with PSHUFB. It is important to do this before we attempt to generate any
16663 // blends but after all of the single-input lowerings. If the single input
16664 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16665 // want to preserve that and we can DAG combine any longer sequences into
16666 // a PSHUFB in the end. But once we start blending from multiple inputs,
16667 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16668 // and there are *very* few patterns that would actually be faster than the
16669 // PSHUFB approach because of its ability to zero lanes.
16670 //
16671 // If the mask is a binary compaction, we can more efficiently perform this
16672 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16673 //
16674 // FIXME: The only exceptions to the above are blends which are exact
16675 // interleavings with direct instructions supporting them. We currently don't
16676 // handle those well here.
16677 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16678 bool V1InUse = false;
16679 bool V2InUse = false;
16680
16681 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16682 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16683
16684 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16685 // do so. This avoids using them to handle blends-with-zero which is
16686 // important as a single pshufb is significantly faster for that.
16687 if (V1InUse && V2InUse) {
16688 if (Subtarget.hasSSE41())
16689 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16690 Zeroable, Subtarget, DAG))
16691 return Blend;
16692
16693 // We can use an unpack to do the blending rather than an or in some
16694 // cases. Even though the or may be (very minorly) more efficient, we
16695 // preference this lowering because there are common cases where part of
16696 // the complexity of the shuffles goes away when we do the final blend as
16697 // an unpack.
16698 // FIXME: It might be worth trying to detect if the unpack-feeding
16699 // shuffles will both be pshufb, in which case we shouldn't bother with
16700 // this.
16701 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16702 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16703 return Unpack;
16704
16705 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16706 if (Subtarget.hasVBMI())
16707 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16708 DAG);
16709
16710 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16711 if (Subtarget.hasXOP()) {
16712 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16713 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16714 }
16715
16716 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16717 // PALIGNR will be cheaper than the second PSHUFB+OR.
16718 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16719 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16720 return V;
16721 }
16722
16723 return PSHUFB;
16724 }
16725
16726 // There are special ways we can lower some single-element blends.
16727 if (NumV2Elements == 1)
16728 if (SDValue V = lowerShuffleAsElementInsertion(
16729 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16730 return V;
16731
16732 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16733 return Blend;
16734
16735 // Check whether a compaction lowering can be done. This handles shuffles
16736 // which take every Nth element for some even N. See the helper function for
16737 // details.
16738 //
16739 // We special case these as they can be particularly efficiently handled with
16740 // the PACKUSWB instruction on x86 and they show up in common patterns of
16741 // rearranging bytes to truncate wide elements.
16742 if (NumEvenDrops) {
16743 // NumEvenDrops is the power of two stride of the elements. Another way of
16744 // thinking about it is that we need to drop the even elements this many
16745 // times to get the original input.
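// E.g. (illustrative) NumEvenDrops == 2 means the mask keeps every 4th byte
// (an <0,4,8,...> pattern), so two PACKUS rounds are emitted below.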
16746
16747 // First we need to zero all the dropped bytes.
16748 assert(NumEvenDrops <= 3 &&
16749 "No support for dropping even elements more than 3 times.");
16750 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16751 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16752 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16753 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16754 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16755 WordClearMask);
16756 if (!IsSingleInput)
16757 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16758 WordClearMask);
16759
16760 // Now pack things back together.
16761 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16762 IsSingleInput ? V1 : V2);
16763 for (int i = 1; i < NumEvenDrops; ++i) {
16764 Result = DAG.getBitcast(MVT::v8i16, Result);
16765 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16766 }
16767 return Result;
16768 }
16769
16770 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16771 if (NumOddDrops == 1) {
16772 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16773 DAG.getBitcast(MVT::v8i16, V1),
16774 DAG.getTargetConstant(8, DL, MVT::i8));
16775 if (!IsSingleInput)
16776 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16777 DAG.getBitcast(MVT::v8i16, V2),
16778 DAG.getTargetConstant(8, DL, MVT::i8));
16779 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16780 IsSingleInput ? V1 : V2);
16781 }
16782
16783 // Handle multi-input cases by blending/unpacking single-input shuffles.
16784 if (NumV2Elements > 0)
16785 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16786 Subtarget, DAG);
16787
16788 // The fallback path for single-input shuffles widens this into two v8i16
16789 // vectors with unpacks, shuffles those, and then pulls them back together
16790 // with a pack.
16791 SDValue V = V1;
16792
16793 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16794 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16795 for (int i = 0; i < 16; ++i)
16796 if (Mask[i] >= 0)
16797 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16798
16799 SDValue VLoHalf, VHiHalf;
16800 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16801 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16802 // i16s.
16803 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16804 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16805 // Use a mask to drop the high bytes.
16806 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16807 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16808 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16809
16810 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16811 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16812
16813 // Squash the masks to point directly into VLoHalf.
16814 for (int &M : LoBlendMask)
16815 if (M >= 0)
16816 M /= 2;
16817 for (int &M : HiBlendMask)
16818 if (M >= 0)
16819 M /= 2;
16820 } else {
16821 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16822 // VHiHalf so that we can blend them as i16s.
16823 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16824
16825 VLoHalf = DAG.getBitcast(
16826 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16827 VHiHalf = DAG.getBitcast(
16828 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16829 }
16830
16831 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16832 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16833
16834 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16835}
16836
16837/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16838///
16839/// This routine breaks down the specific type of 128-bit shuffle and
16840/// dispatches to the lowering routines accordingly.
16841static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16842 MVT VT, SDValue V1, SDValue V2,
16843 const APInt &Zeroable,
16844 const X86Subtarget &Subtarget,
16845 SelectionDAG &DAG) {
16846 switch (VT.SimpleTy) {
16847 case MVT::v2i64:
16848 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16849 case MVT::v2f64:
16850 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16851 case MVT::v4i32:
16852 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16853 case MVT::v4f32:
16854 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16855 case MVT::v8i16:
16856 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16857 case MVT::v8f16:
16858 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16859 case MVT::v16i8:
16860 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16861
16862 default:
16863 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16863)
;
16864 }
16865}
16866
16867/// Generic routine to split vector shuffle into half-sized shuffles.
16868///
16869/// This routine just extracts two subvectors, shuffles them independently, and
16870/// then concatenates them back together. This should work effectively with all
16871/// AVX vector shuffle types.
16872static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16873 SDValue V2, ArrayRef<int> Mask,
16874 SelectionDAG &DAG) {
16875   assert(VT.getSizeInBits() >= 256 &&
16876          "Only for 256-bit or wider vector shuffles!");
16877   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16878   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16879
16880 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16881 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16882
16883 int NumElements = VT.getVectorNumElements();
16884 int SplitNumElements = NumElements / 2;
16885 MVT ScalarVT = VT.getVectorElementType();
16886 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16887
16888 // Use splitVector/extractSubVector so that split build-vectors just build two
16889 // narrower build vectors. This helps shuffling with splats and zeros.
16890 auto SplitVector = [&](SDValue V) {
16891 SDValue LoV, HiV;
16892 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16893 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16894 DAG.getBitcast(SplitVT, HiV));
16895 };
16896
16897 SDValue LoV1, HiV1, LoV2, HiV2;
16898 std::tie(LoV1, HiV1) = SplitVector(V1);
16899 std::tie(LoV2, HiV2) = SplitVector(V2);
16900
16901 // Now create two 4-way blends of these half-width vectors.
16902 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16903 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16904 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16905 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16906 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16907 for (int i = 0; i < SplitNumElements; ++i) {
16908 int M = HalfMask[i];
16909 if (M >= NumElements) {
16910 if (M >= NumElements + SplitNumElements)
16911 UseHiV2 = true;
16912 else
16913 UseLoV2 = true;
16914 V2BlendMask[i] = M - NumElements;
16915 BlendMask[i] = SplitNumElements + i;
16916 } else if (M >= 0) {
16917 if (M >= SplitNumElements)
16918 UseHiV1 = true;
16919 else
16920 UseLoV1 = true;
16921 V1BlendMask[i] = M;
16922 BlendMask[i] = i;
16923 }
16924 }
16925
16926 // Because the lowering happens after all combining takes place, we need to
16927 // manually combine these blend masks as much as possible so that we create
16928 // a minimal number of high-level vector shuffle nodes.
16929
16930 // First try just blending the halves of V1 or V2.
16931 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16932 return DAG.getUNDEF(SplitVT);
16933 if (!UseLoV2 && !UseHiV2)
16934 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16935 if (!UseLoV1 && !UseHiV1)
16936 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16937
16938 SDValue V1Blend, V2Blend;
16939 if (UseLoV1 && UseHiV1) {
16940 V1Blend =
16941 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16942 } else {
16943 // We only use half of V1 so map the usage down into the final blend mask.
16944 V1Blend = UseLoV1 ? LoV1 : HiV1;
16945 for (int i = 0; i < SplitNumElements; ++i)
16946 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16947 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16948 }
16949 if (UseLoV2 && UseHiV2) {
16950 V2Blend =
16951 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16952 } else {
16953 // We only use half of V2 so map the usage down into the final blend mask.
16954 V2Blend = UseLoV2 ? LoV2 : HiV2;
16955 for (int i = 0; i < SplitNumElements; ++i)
16956 if (BlendMask[i] >= SplitNumElements)
16957 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16958 }
16959 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16960 };
16961 SDValue Lo = HalfBlend(LoMask);
16962 SDValue Hi = HalfBlend(HiMask);
16963 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16964}
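
A minimal sketch of how the HalfBlend lambda above classifies each half-mask entry into the four half-width inputs (LoV1/HiV1/LoV2/HiV2); the 8-element size and the sample mask are assumptions made for the example:

    // Illustrative only: classify the low half of a hypothetical v8 mask.
    #include <cstdio>

    int main() {
      const int NumElements = 8, SplitNumElements = 4;
      int HalfMask[4] = {1, 5, 10, -1};
      for (int i = 0; i < SplitNumElements; ++i) {
        int M = HalfMask[i];
        if (M < 0) {
          std::printf("elt %d: undef\n", i);
        } else if (M >= NumElements) {
          bool Hi = M >= NumElements + SplitNumElements;
          std::printf("elt %d: V2 %s half, elt %d\n", i, Hi ? "high" : "low",
                      M - NumElements);
        } else {
          bool Hi = M >= SplitNumElements;
          std::printf("elt %d: V1 %s half, elt %d\n", i, Hi ? "high" : "low", M);
        }
      }
    }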
16965
16966/// Either split a vector in halves or decompose the shuffles and the
16967/// blend/unpack.
16968///
16969/// This is provided as a good fallback for many lowerings of non-single-input
16970/// shuffles with more than one 128-bit lane. In those cases, we want to select
16971/// between splitting the shuffle into 128-bit components and stitching those
16972/// back together vs. extracting the single-input shuffles and blending those
16973/// results.
16974static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16975 SDValue V2, ArrayRef<int> Mask,
16976 const X86Subtarget &Subtarget,
16977 SelectionDAG &DAG) {
16978   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16979                           "shuffles as it could then recurse on itself.");
16980 int Size = Mask.size();
16981
16982 // If this can be modeled as a broadcast of two elements followed by a blend,
16983 // prefer that lowering. This is especially important because broadcasts can
16984 // often fold with memory operands.
16985 auto DoBothBroadcast = [&] {
16986 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16987 for (int M : Mask)
16988 if (M >= Size) {
16989 if (V2BroadcastIdx < 0)
16990 V2BroadcastIdx = M - Size;
16991 else if (M - Size != V2BroadcastIdx)
16992 return false;
16993 } else if (M >= 0) {
16994 if (V1BroadcastIdx < 0)
16995 V1BroadcastIdx = M;
16996 else if (M != V1BroadcastIdx)
16997 return false;
16998 }
16999 return true;
17000 };
17001 if (DoBothBroadcast())
17002 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17003 DAG);
17004
17005 // If the inputs all stem from a single 128-bit lane of each input, then we
17006 // split them rather than blending because the split will decompose to
17007 // unusually few instructions.
17008 int LaneCount = VT.getSizeInBits() / 128;
17009 int LaneSize = Size / LaneCount;
17010 SmallBitVector LaneInputs[2];
17011 LaneInputs[0].resize(LaneCount, false);
17012 LaneInputs[1].resize(LaneCount, false);
17013 for (int i = 0; i < Size; ++i)
17014 if (Mask[i] >= 0)
17015 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17016 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17017 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17018
17019 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17020 // requires that the decomposed single-input shuffles don't end up here.
17021 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17022 DAG);
17023}
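
The DoBothBroadcast test above reduces to a small predicate over the mask; the sketch below (standalone, with hypothetical masks) shows when the decomposed-broadcast path is taken:

    // Illustrative only: true iff each operand contributes at most one
    // distinct element, i.e. the shuffle is a blend of two broadcasts.
    #include <cstdio>
    #include <vector>

    static bool doBothBroadcast(const std::vector<int> &Mask, int Size) {
      int V1Idx = -1, V2Idx = -1;
      for (int M : Mask) {
        if (M >= Size) {
          if (V2Idx < 0)
            V2Idx = M - Size;
          else if (M - Size != V2Idx)
            return false;
        } else if (M >= 0) {
          if (V1Idx < 0)
            V1Idx = M;
          else if (M != V1Idx)
            return false;
        }
      }
      return true;
    }

    int main() {
      std::printf("%d\n", doBothBroadcast({0, 5, 0, 5}, 4)); // 1: both splats
      std::printf("%d\n", doBothBroadcast({0, 5, 1, 5}, 4)); // 0: V1 uses 0 and 1
    }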
17024
17025// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17026// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17027static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17028 SDValue V1, SDValue V2,
17029 ArrayRef<int> Mask,
17030 SelectionDAG &DAG) {
17031   assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17032
17033 int LHSMask[4] = {-1, -1, -1, -1};
17034 int RHSMask[4] = {-1, -1, -1, -1};
17035 unsigned SHUFPMask = 0;
17036
17037 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17038 // perform the shuffle once the lanes have been shuffled in place.
17039 for (int i = 0; i != 4; ++i) {
17040 int M = Mask[i];
17041 if (M < 0)
17042 continue;
17043 int LaneBase = i & ~1;
17044 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17045 LaneMask[LaneBase + (M & 1)] = M;
17046 SHUFPMask |= (M & 1) << i;
17047 }
17048
17049 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17050 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17051 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17052 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17053}
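
Worked example for the per-lane mask construction above, using a hypothetical lane-crossing v4f64 mask {2, 4, 1, 7} (values chosen purely for illustration):

    // Illustrative only: build the two pre-shuffle masks and the SHUFPD imm.
    #include <cstdio>

    int main() {
      int Mask[4] = {2, 4, 1, 7};
      int LHSMask[4] = {-1, -1, -1, -1};
      int RHSMask[4] = {-1, -1, -1, -1};
      unsigned SHUFPMask = 0;
      for (int i = 0; i != 4; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        int LaneBase = i & ~1;               // 0 for lane 0, 2 for lane 1
        int *LaneMask = (i & 1) ? RHSMask : LHSMask;
        LaneMask[LaneBase + (M & 1)] = M;    // keep the element's parity slot
        SHUFPMask |= (M & 1) << i;           // SHUFPD selects low/high per elt
      }
      std::printf("LHS: %d %d %d %d\n", LHSMask[0], LHSMask[1], LHSMask[2], LHSMask[3]);
      std::printf("RHS: %d %d %d %d\n", RHSMask[0], RHSMask[1], RHSMask[2], RHSMask[3]);
      std::printf("imm: 0x%x\n", SHUFPMask); // LHS: 2 -1 -1 1, RHS: 4 -1 -1 7, imm: 0xc
    }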
17054
17055/// Lower a vector shuffle crossing multiple 128-bit lanes as
17056/// a lane permutation followed by a per-lane permutation.
17057///
17058/// This is mainly for cases where we can have non-repeating permutes
17059/// in each lane.
17060///
17061/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17062/// we should investigate merging them.
17063static SDValue lowerShuffleAsLanePermuteAndPermute(
17064 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17065 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17066 int NumElts = VT.getVectorNumElements();
17067 int NumLanes = VT.getSizeInBits() / 128;
17068 int NumEltsPerLane = NumElts / NumLanes;
17069 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17070
17071 /// Attempts to find a sublane permute with the given size
17072 /// that gets all elements into their target lanes.
17073 ///
17074   /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered shuffle.
17075   /// If unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
17076 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17077 int NumSublanesPerLane = NumSublanes / NumLanes;
17078 int NumEltsPerSublane = NumElts / NumSublanes;
17079
17080 SmallVector<int, 16> CrossLaneMask;
17081 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17082 // CrossLaneMask but one entry == one sublane.
17083 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17084
17085 for (int i = 0; i != NumElts; ++i) {
17086 int M = Mask[i];
17087 if (M < 0)
17088 continue;
17089
17090 int SrcSublane = M / NumEltsPerSublane;
17091 int DstLane = i / NumEltsPerLane;
17092
17093 // We only need to get the elements into the right lane, not sublane.
17094 // So search all sublanes that make up the destination lane.
17095 bool Found = false;
17096 int DstSubStart = DstLane * NumSublanesPerLane;
17097 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17098 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17099 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17100 continue;
17101
17102 Found = true;
17103 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17104 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17105 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17106 break;
17107 }
17108 if (!Found)
17109 return SDValue();
17110 }
17111
17112 // Fill CrossLaneMask using CrossLaneMaskLarge.
17113 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17114
17115 if (!CanUseSublanes) {
17116 // If we're only shuffling a single lowest lane and the rest are identity
17117 // then don't bother.
17118 // TODO - isShuffleMaskInputInPlace could be extended to something like
17119 // this.
17120 int NumIdentityLanes = 0;
17121 bool OnlyShuffleLowestLane = true;
17122 for (int i = 0; i != NumLanes; ++i) {
17123 int LaneOffset = i * NumEltsPerLane;
17124 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17125 i * NumEltsPerLane))
17126 NumIdentityLanes++;
17127 else if (CrossLaneMask[LaneOffset] != 0)
17128 OnlyShuffleLowestLane = false;
17129 }
17130 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17131 return SDValue();
17132 }
17133
17134 // Avoid returning the same shuffle operation. For example,
17135 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17136 // undef:v16i16
17137 if (CrossLaneMask == Mask || InLaneMask == Mask)
17138 return SDValue();
17139
17140 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17141 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17142 InLaneMask);
17143 };
17144
17145 // First attempt a solution with full lanes.
17146 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17147 return V;
17148
17149 // The rest of the solutions use sublanes.
17150 if (!CanUseSublanes)
17151 return SDValue();
17152
17153 // Then attempt a solution with 64-bit sublanes (vpermq).
17154 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17155 return V;
17156
17157 // If that doesn't work and we have fast variable cross-lane shuffle,
17158 // attempt 32-bit sublanes (vpermd).
17159 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17160 return SDValue();
17161
17162 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17163}
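
A simplified sketch of the two-stage decomposition computed by getSublanePermute above, for a hypothetical single-input v8f32 mask with whole-lane sublanes; the conflict checks are omitted, so this is only the happy path under those assumptions:

    // Illustrative only: derive the cross-lane and in-lane masks for
    // mask {5,4,7,6,1,0,3,2} with NumSublanes == NumLanes == 2.
    #include <cstdio>

    int main() {
      const int NumElts = 8, NumLanes = 2, NumSublanes = 2;
      const int NumEltsPerLane = NumElts / NumLanes;        // 4
      const int NumEltsPerSublane = NumElts / NumSublanes;  // 4
      int Mask[NumElts] = {5, 4, 7, 6, 1, 0, 3, 2};
      int LargeCross[NumSublanes] = {-1, -1};  // one entry per sublane
      int InLane[NumElts];
      for (int i = 0; i != NumElts; ++i) {
        int M = Mask[i];
        int SrcSublane = M / NumEltsPerSublane;
        int DstSublane = i / NumEltsPerLane;   // one sublane per lane here
        LargeCross[DstSublane] = SrcSublane;
        InLane[i] = DstSublane * NumEltsPerSublane + M % NumEltsPerSublane;
      }
      std::printf("cross-lane (per sublane): %d %d\n", LargeCross[0], LargeCross[1]);
      std::printf("in-lane:");
      for (int M : InLane)
        std::printf(" %d", M);                 // 1 0 3 2 5 4 7 6
      std::printf("\n");
    }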
17164
17165/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17166/// source with a lane permutation.
17167///
17168/// This lowering strategy results in four instructions in the worst case for a
17169/// single-input cross lane shuffle which is lower than any other fully general
17170/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17171/// shuffle pattern should be handled prior to trying this lowering.
17172static SDValue lowerShuffleAsLanePermuteAndShuffle(
17173 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17174 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17175 // FIXME: This should probably be generalized for 512-bit vectors as well.
17176   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15. Assuming the condition is true
16. '?' condition is true
17177 int Size = Mask.size();
17178 int LaneSize = Size / 2;
17179
17180 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17181 // Only do this if the elements aren't all from the lower lane,
17182 // otherwise we're (probably) better off doing a split.
17183 if (VT == MVT::v4f64 &&
17. Taking false branch
17184 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17185 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17186
17187 // If there are only inputs from one 128-bit lane, splitting will in fact be
17188 // less expensive. The flags track whether the given lane contains an element
17189 // that crosses to another lane.
17190 bool AllLanes;
17191 if (!Subtarget.hasAVX2()) {
18. Taking true branch
17192 bool LaneCrossing[2] = {false, false};
17193 for (int i = 0; i < Size; ++i)
19. Assuming 'i' is < 'Size'
21. Assuming 'i' is >= 'Size'
17194 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
20. Assuming the condition is false
17195 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17196 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17197 } else {
17198 bool LaneUsed[2] = {false, false};
17199 for (int i = 0; i < Size; ++i)
17200 if (Mask[i] >= 0)
17201 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17202 AllLanes = LaneUsed[0] && LaneUsed[1];
17203 }
17204
17205 // TODO - we could support shuffling V2 in the Flipped input.
17206   assert(V2.isUndef() &&
22. '?' condition is true
17207          "This last part of this routine only works on single input shuffles");
17208
17209 SmallVector<int, 32> InLaneMask(Mask);
17210 for (int i = 0; i < Size; ++i) {
23. Loop condition is true. Entering loop body
17211 int &M = InLaneMask[i];
17212 if (M < 0)
24. Assuming 'M' is >= 0
25. Taking false branch
17213 continue;
17214 if (((M % Size) / LaneSize) != (i / LaneSize))
26. The result of the '/' expression is undefined
17215 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17216 }
17217   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17218          "In-lane shuffle mask expected");
17219
17220 // If we're not using both lanes in each lane and the inlane mask is not
17221 // repeating, then we're better off splitting.
17222 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17223 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17224
17225 // Flip the lanes, and shuffle the results which should now be in-lane.
17226 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17227 SDValue Flipped = DAG.getBitcast(PVT, V1);
17228 Flipped =
17229 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17230 Flipped = DAG.getBitcast(VT, Flipped);
17231 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17232}
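
The loop at lines 17210-17215 is where the analyzer reports step 26 (the result of the '/' expression is undefined); it appears unable to prove that LaneSize (Size / 2) is nonzero, although for the 256-bit types this routine asserts on the mask has at least four elements. The sketch below (standalone, with a hypothetical v4f64 mask) shows what the remapping computes in the ordinary case:

    // Illustrative only: redirect lane-crossing elements into the flipped
    // operand, which occupies indices [Size, 2*Size) of the final shuffle.
    #include <cstdio>

    int main() {
      const int Size = 4, LaneSize = Size / 2;
      int InLaneMask[4] = {2, 3, 0, 1};
      for (int i = 0; i < Size; ++i) {
        int &M = InLaneMask[i];
        if (M < 0)
          continue;
        if (((M % Size) / LaneSize) != (i / LaneSize))
          M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
      }
      for (int M : InLaneMask)
        std::printf("%d ", M);   // 4 5 6 7: every element now comes from Flipped
      std::printf("\n");
    }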
17233
17234/// Handle lowering 2-lane 128-bit shuffles.
17235static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17236 SDValue V2, ArrayRef<int> Mask,
17237 const APInt &Zeroable,
17238 const X86Subtarget &Subtarget,
17239 SelectionDAG &DAG) {
17240 if (V2.isUndef()) {
17241 // Attempt to match VBROADCAST*128 subvector broadcast load.
17242 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17243 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17244 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17245 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17246 MVT MemVT = VT.getHalfNumVectorElementsVT();
17247 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17248 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17249 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17250 VT, MemVT, Ld, Ofs, DAG))
17251 return BcstLd;
17252 }
17253
17254 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17255 if (Subtarget.hasAVX2())
17256 return SDValue();
17257 }
17258
17259 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17260
17261 SmallVector<int, 4> WidenedMask;
17262 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17263 return SDValue();
17264
17265 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17266 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17267
17268 // Try to use an insert into a zero vector.
17269 if (WidenedMask[0] == 0 && IsHighZero) {
17270 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17271 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17272 DAG.getIntPtrConstant(0, DL));
17273 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17274 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17275 DAG.getIntPtrConstant(0, DL));
17276 }
17277
17278   // TODO: If minimizing size and one of the inputs is a zero vector and
17279   // the zero vector has only one use, we could use a VPERM2X128 to save the
17280 // instruction bytes needed to explicitly generate the zero vector.
17281
17282 // Blends are faster and handle all the non-lane-crossing cases.
17283 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17284 Subtarget, DAG))
17285 return Blend;
17286
17287 // If either input operand is a zero vector, use VPERM2X128 because its mask
17288 // allows us to replace the zero input with an implicit zero.
17289 if (!IsLowZero && !IsHighZero) {
17290 // Check for patterns which can be matched with a single insert of a 128-bit
17291 // subvector.
17292 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17293 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17294
17295 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17296 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17297 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17298 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17299 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17300 OnlyUsesV1 ? V1 : V2,
17301 DAG.getIntPtrConstant(0, DL));
17302 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17303 DAG.getIntPtrConstant(2, DL));
17304 }
17305 }
17306
17307 // Try to use SHUF128 if possible.
17308 if (Subtarget.hasVLX()) {
17309 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17310 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17311 ((WidenedMask[1] % 2) << 1);
17312 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17313 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17314 }
17315 }
17316 }
17317
17318 // Otherwise form a 128-bit permutation. After accounting for undefs,
17319 // convert the 64-bit shuffle mask selection values into 128-bit
17320 // selection bits by dividing the indexes by 2 and shifting into positions
17321 // defined by a vperm2*128 instruction's immediate control byte.
17322
17323 // The immediate permute control byte looks like this:
17324 // [1:0] - select 128 bits from sources for low half of destination
17325 // [2] - ignore
17326 // [3] - zero low half of destination
17327 // [5:4] - select 128 bits from sources for high half of destination
17328 // [6] - ignore
17329 // [7] - zero high half of destination
17330
17331   assert((WidenedMask[0] >= 0 || IsLowZero) &&
17332          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17333
17334 unsigned PermMask = 0;
17335 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17336 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17337
17338 // Check the immediate mask and replace unused sources with undef.
17339 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17340 V1 = DAG.getUNDEF(VT);
17341 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17342 V2 = DAG.getUNDEF(VT);
17343
17344 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17345 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17346}
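
A small sketch of the vperm2*128 immediate construction described in the comment above, with a hypothetical widened mask and no zeroable halves:

    // Illustrative only: widened mask {1, 2} means "low 128 bits from V1's
    // upper half, high 128 bits from V2's lower half".
    #include <cstdio>

    int main() {
      int WidenedMask[2] = {1, 2};
      bool IsLowZero = false, IsHighZero = false;
      unsigned PermMask = 0;
      PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
      PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
      std::printf("0x%02x\n", PermMask); // 0x21
    }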
17347
17348/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17349/// shuffling each lane.
17350///
17351/// This attempts to create a repeated lane shuffle where each lane uses one
17352/// or two of the lanes of the inputs. The lanes of the input vectors are
17353/// shuffled in one or two independent shuffles to get the lanes into the
17354/// position needed by the final shuffle.
17355static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17356 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17357 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17358   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
17359
17360 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17361 return SDValue();
17362
17363 int NumElts = Mask.size();
17364 int NumLanes = VT.getSizeInBits() / 128;
17365 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17366 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17367 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17368
17369 // First pass will try to fill in the RepeatMask from lanes that need two
17370 // sources.
17371 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17372 int Srcs[2] = {-1, -1};
17373 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17374 for (int i = 0; i != NumLaneElts; ++i) {
17375 int M = Mask[(Lane * NumLaneElts) + i];
17376 if (M < 0)
17377 continue;
17378 // Determine which of the possible input lanes (NumLanes from each source)
17379 // this element comes from. Assign that as one of the sources for this
17380       // lane. We can assign up to 2 sources for this lane. If we run out of
17381       // sources we can't do anything.
17382 int LaneSrc = M / NumLaneElts;
17383 int Src;
17384 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17385 Src = 0;
17386 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17387 Src = 1;
17388 else
17389 return SDValue();
17390
17391 Srcs[Src] = LaneSrc;
17392 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17393 }
17394
17395 // If this lane has two sources, see if it fits with the repeat mask so far.
17396 if (Srcs[1] < 0)
17397 continue;
17398
17399 LaneSrcs[Lane][0] = Srcs[0];
17400 LaneSrcs[Lane][1] = Srcs[1];
17401
17402 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17403       assert(M1.size() == M2.size() && "Unexpected mask size");
17404 for (int i = 0, e = M1.size(); i != e; ++i)
17405 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17406 return false;
17407 return true;
17408 };
17409
17410 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17411       assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17412 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17413 int M = Mask[i];
17414 if (M < 0)
17415 continue;
17416         assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17417                "Unexpected mask element");
17418 MergedMask[i] = M;
17419 }
17420 };
17421
17422 if (MatchMasks(InLaneMask, RepeatMask)) {
17423 // Merge this lane mask into the final repeat mask.
17424 MergeMasks(InLaneMask, RepeatMask);
17425 continue;
17426 }
17427
17428 // Didn't find a match. Swap the operands and try again.
17429 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17430 ShuffleVectorSDNode::commuteMask(InLaneMask);
17431
17432 if (MatchMasks(InLaneMask, RepeatMask)) {
17433 // Merge this lane mask into the final repeat mask.
17434 MergeMasks(InLaneMask, RepeatMask);
17435 continue;
17436 }
17437
17438 // Couldn't find a match with the operands in either order.
17439 return SDValue();
17440 }
17441
17442 // Now handle any lanes with only one source.
17443 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17444 // If this lane has already been processed, skip it.
17445 if (LaneSrcs[Lane][0] >= 0)
17446 continue;
17447
17448 for (int i = 0; i != NumLaneElts; ++i) {
17449 int M = Mask[(Lane * NumLaneElts) + i];
17450 if (M < 0)
17451 continue;
17452
17453       // If RepeatMask isn't defined yet we can define it ourselves.
17454 if (RepeatMask[i] < 0)
17455 RepeatMask[i] = M % NumLaneElts;
17456
17457 if (RepeatMask[i] < NumElts) {
17458 if (RepeatMask[i] != M % NumLaneElts)
17459 return SDValue();
17460 LaneSrcs[Lane][0] = M / NumLaneElts;
17461 } else {
17462 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17463 return SDValue();
17464 LaneSrcs[Lane][1] = M / NumLaneElts;
17465 }
17466 }
17467
17468 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17469 return SDValue();
17470 }
17471
17472 SmallVector<int, 16> NewMask(NumElts, -1);
17473 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17474 int Src = LaneSrcs[Lane][0];
17475 for (int i = 0; i != NumLaneElts; ++i) {
17476 int M = -1;
17477 if (Src >= 0)
17478 M = Src * NumLaneElts + i;
17479 NewMask[Lane * NumLaneElts + i] = M;
17480 }
17481 }
17482 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17483 // Ensure we didn't get back the shuffle we started with.
17484 // FIXME: This is a hack to make up for some splat handling code in
17485 // getVectorShuffle.
17486 if (isa<ShuffleVectorSDNode>(NewV1) &&
17487 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17488 return SDValue();
17489
17490 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17491 int Src = LaneSrcs[Lane][1];
17492 for (int i = 0; i != NumLaneElts; ++i) {
17493 int M = -1;
17494 if (Src >= 0)
17495 M = Src * NumLaneElts + i;
17496 NewMask[Lane * NumLaneElts + i] = M;
17497 }
17498 }
17499 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17500 // Ensure we didn't get back the shuffle we started with.
17501 // FIXME: This is a hack to make up for some splat handling code in
17502 // getVectorShuffle.
17503 if (isa<ShuffleVectorSDNode>(NewV2) &&
17504 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17505 return SDValue();
17506
17507 for (int i = 0; i != NumElts; ++i) {
17508 if (Mask[i] < 0) {
17509 NewMask[i] = -1;
17510 continue;
17511 }
17512 NewMask[i] = RepeatMask[i % NumLaneElts];
17513 if (NewMask[i] < 0)
17514 continue;
17515
17516 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17517 }
17518 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17519}
17520
17521/// If the input shuffle mask results in a vector that is undefined in all upper
17522/// or lower half elements and that mask accesses only 2 halves of the
17523/// shuffle's operands, return true. A mask of half the width with mask indexes
17524/// adjusted to access the extracted halves of the original shuffle operands is
17525/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17526/// lower half of each input operand is accessed.
17527static bool
17528getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17529 int &HalfIdx1, int &HalfIdx2) {
17530   assert((Mask.size() == HalfMask.size() * 2) &&
17531          "Expected input mask to be twice as long as output");
17532
17533 // Exactly one half of the result must be undef to allow narrowing.
17534 bool UndefLower = isUndefLowerHalf(Mask);
17535 bool UndefUpper = isUndefUpperHalf(Mask);
17536 if (UndefLower == UndefUpper)
17537 return false;
17538
17539 unsigned HalfNumElts = HalfMask.size();
17540 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17541 HalfIdx1 = -1;
17542 HalfIdx2 = -1;
17543 for (unsigned i = 0; i != HalfNumElts; ++i) {
17544 int M = Mask[i + MaskIndexOffset];
17545 if (M < 0) {
17546 HalfMask[i] = M;
17547 continue;
17548 }
17549
17550 // Determine which of the 4 half vectors this element is from.
17551 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17552 int HalfIdx = M / HalfNumElts;
17553
17554 // Determine the element index into its half vector source.
17555 int HalfElt = M % HalfNumElts;
17556
17557 // We can shuffle with up to 2 half vectors, set the new 'half'
17558 // shuffle mask accordingly.
17559 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17560 HalfMask[i] = HalfElt;
17561 HalfIdx1 = HalfIdx;
17562 continue;
17563 }
17564 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17565 HalfMask[i] = HalfElt + HalfNumElts;
17566 HalfIdx2 = HalfIdx;
17567 continue;
17568 }
17569
17570 // Too many half vectors referenced.
17571 return false;
17572 }
17573
17574 return true;
17575}
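
The routine above can be exercised in isolation; the helper below is a standalone rework (names and sample mask are assumptions) for the common case where the upper half of the result is undef:

    // Illustrative only: compute the half mask and the two half indices
    // (0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2).
    #include <cstdio>
    #include <vector>

    static bool halfShuffleMask(const std::vector<int> &Mask,
                                std::vector<int> &HalfMask, int &HalfIdx1,
                                int &HalfIdx2) {
      int HalfNumElts = (int)Mask.size() / 2;
      HalfMask.assign(HalfNumElts, -1);
      HalfIdx1 = HalfIdx2 = -1;
      for (int i = 0; i != HalfNumElts; ++i) { // assumes upper half is undef
        int M = Mask[i];
        if (M < 0)
          continue;
        int HalfIdx = M / HalfNumElts;
        int HalfElt = M % HalfNumElts;
        if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
          HalfMask[i] = HalfElt;
          HalfIdx1 = HalfIdx;
        } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
          HalfMask[i] = HalfElt + HalfNumElts;
          HalfIdx2 = HalfIdx;
        } else {
          return false; // more than two half vectors referenced
        }
      }
      return true;
    }

    int main() {
      std::vector<int> HalfMask;
      int Idx1, Idx2;
      if (halfShuffleMask({0, 12, 3, 14, -1, -1, -1, -1}, HalfMask, Idx1, Idx2))
        std::printf("halves %d,%d mask %d %d %d %d\n", Idx1, Idx2, HalfMask[0],
                    HalfMask[1], HalfMask[2], HalfMask[3]); // halves 0,3 mask 0 4 3 6
    }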
17576
17577/// Given the output values from getHalfShuffleMask(), create a half width
17578/// shuffle of extracted vectors followed by an insert back to full width.
17579static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17580 ArrayRef<int> HalfMask, int HalfIdx1,
17581 int HalfIdx2, bool UndefLower,
17582 SelectionDAG &DAG, bool UseConcat = false) {
17583   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17584   assert(V1.getValueType().isSimple() && "Expecting only simple types");
17585
17586 MVT VT = V1.getSimpleValueType();
17587 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17588 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17589
17590 auto getHalfVector = [&](int HalfIdx) {
17591 if (HalfIdx < 0)
17592 return DAG.getUNDEF(HalfVT);
17593 SDValue V = (HalfIdx < 2 ? V1 : V2);
17594 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17595 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17596 DAG.getIntPtrConstant(HalfIdx, DL));
17597 };
17598
17599 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17600 SDValue Half1 = getHalfVector(HalfIdx1);
17601 SDValue Half2 = getHalfVector(HalfIdx2);
17602 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17603 if (UseConcat) {
17604 SDValue Op0 = V;
17605 SDValue Op1 = DAG.getUNDEF(HalfVT);
17606 if (UndefLower)
17607 std::swap(Op0, Op1);
17608 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17609 }
17610
17611 unsigned Offset = UndefLower ? HalfNumElts : 0;
17612 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17613 DAG.getIntPtrConstant(Offset, DL));
17614}
17615
17616/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17617/// This allows for fast cases such as subvector extraction/insertion
17618/// or shuffling smaller vector types which can lower more efficiently.
17619static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17620 SDValue V2, ArrayRef<int> Mask,
17621 const X86Subtarget &Subtarget,
17622 SelectionDAG &DAG) {
17623   assert((VT.is256BitVector() || VT.is512BitVector()) &&
17624          "Expected 256-bit or 512-bit vector");
17625
17626 bool UndefLower = isUndefLowerHalf(Mask);
17627 if (!UndefLower && !isUndefUpperHalf(Mask))
17628 return SDValue();
17629
17630   assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17631          "Completely undef shuffle mask should have been simplified already");
17632
17633 // Upper half is undef and lower half is whole upper subvector.
17634 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17635 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17636 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17637 if (!UndefLower &&
17638 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17639 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17640 DAG.getIntPtrConstant(HalfNumElts, DL));
17641 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17642 DAG.getIntPtrConstant(0, DL));
17643 }
17644
17645 // Lower half is undef and upper half is whole lower subvector.
17646 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17647 if (UndefLower &&
17648 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17649 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17650 DAG.getIntPtrConstant(0, DL));
17651 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17652 DAG.getIntPtrConstant(HalfNumElts, DL));
17653 }
17654
17655 int HalfIdx1, HalfIdx2;
17656 SmallVector<int, 8> HalfMask(HalfNumElts);
17657 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17658 return SDValue();
17659
17660   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17661
17662 // Only shuffle the halves of the inputs when useful.
17663 unsigned NumLowerHalves =
17664 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17665 unsigned NumUpperHalves =
17666 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17667   assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17668
17669 // Determine the larger pattern of undef/halves, then decide if it's worth
17670 // splitting the shuffle based on subtarget capabilities and types.
17671 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17672 if (!UndefLower) {
17673 // XXXXuuuu: no insert is needed.
17674 // Always extract lowers when setting lower - these are all free subreg ops.
17675 if (NumUpperHalves == 0)
17676 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17677 UndefLower, DAG);
17678
17679 if (NumUpperHalves == 1) {
17680 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17681 if (Subtarget.hasAVX2()) {
17682 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
17683 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17684 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17685 (!isSingleSHUFPSMask(HalfMask) ||
17686 Subtarget.hasFastVariableCrossLaneShuffle()))
17687 return SDValue();
17688 // If this is a unary shuffle (assume that the 2nd operand is
17689 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17690 // are better off extracting the upper half of 1 operand and using a
17691 // narrow shuffle.
17692 if (EltWidth == 64 && V2.isUndef())
17693 return SDValue();
17694 }
17695 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17696 if (Subtarget.hasAVX512() && VT.is512BitVector())
17697 return SDValue();
17698 // Extract + narrow shuffle is better than the wide alternative.
17699 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17700 UndefLower, DAG);
17701 }
17702
17703 // Don't extract both uppers, instead shuffle and then extract.
17704     assert(NumUpperHalves == 2 && "Half vector count went wrong");
17705 return SDValue();
17706 }
17707
17708 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17709 if (NumUpperHalves == 0) {
17710 // AVX2 has efficient 64-bit element cross-lane shuffles.
17711 // TODO: Refine to account for unary shuffle, splat, and other masks?
17712 if (Subtarget.hasAVX2() && EltWidth == 64)
17713 return SDValue();
17714 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17715 if (Subtarget.hasAVX512() && VT.is512BitVector())
17716 return SDValue();
17717 // Narrow shuffle + insert is better than the wide alternative.
17718 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17719 UndefLower, DAG);
17720 }
17721
17722 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17723 return SDValue();
17724}
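
How the half indices feed the lower/upper counts used above can be seen with a small standalone computation (the HalfIdx values are hypothetical, e.g. lower half of V1 plus upper half of V2):

    // Illustrative only: count referenced lower and upper halves.
    #include <cstdio>

    int main() {
      int HalfIdx1 = 0, HalfIdx2 = 3;
      unsigned NumLowerHalves =
          (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
      unsigned NumUpperHalves =
          (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
      std::printf("lower=%u upper=%u\n", NumLowerHalves, NumUpperHalves); // lower=1 upper=1
    }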
17725
17726/// Handle case where shuffle sources are coming from the same 128-bit lane and
17727/// every lane can be represented as the same repeating mask - allowing us to
17728/// shuffle the sources with the repeating shuffle and then permute the result
17729/// to the destination lanes.
17730static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17731 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17732 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17733 int NumElts = VT.getVectorNumElements();
17734 int NumLanes = VT.getSizeInBits() / 128;
17735 int NumLaneElts = NumElts / NumLanes;
17736
17737 // On AVX2 we may be able to just shuffle the lowest elements and then
17738 // broadcast the result.
17739 if (Subtarget.hasAVX2()) {
17740 for (unsigned BroadcastSize : {16, 32, 64}) {
17741 if (BroadcastSize <= VT.getScalarSizeInBits())
17742 continue;
17743 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17744
17745 // Attempt to match a repeating pattern every NumBroadcastElts,
17746       // accounting for UNDEFs, but only referencing the lowest 128-bit
17747 // lane of the inputs.
17748 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17749 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17750 for (int j = 0; j != NumBroadcastElts; ++j) {
17751 int M = Mask[i + j];
17752 if (M < 0)
17753 continue;
17754 int &R = RepeatMask[j];
17755 if (0 != ((M % NumElts) / NumLaneElts))
17756 return false;
17757 if (0 <= R && R != M)
17758 return false;
17759 R = M;
17760 }
17761 return true;
17762 };
17763
17764 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17765 if (!FindRepeatingBroadcastMask(RepeatMask))
17766 continue;
17767
17768 // Shuffle the (lowest) repeated elements in place for broadcast.
17769 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17770
17771 // Shuffle the actual broadcast.
17772 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17773 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17774 for (int j = 0; j != NumBroadcastElts; ++j)
17775 BroadcastMask[i + j] = j;
17776 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17777 BroadcastMask);
17778 }
17779 }
17780
17781 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17782 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17783 return SDValue();
17784
17785 // Bail if we already have a repeated lane shuffle mask.
17786 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17787 return SDValue();
17788
17789 // Helper to look for repeated mask in each split sublane, and that those
17790 // sublanes can then be permuted into place.
17791 auto ShuffleSubLanes = [&](int SubLaneScale) {
17792 int NumSubLanes = NumLanes * SubLaneScale;
17793 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17794
17795 // Check that all the sources are coming from the same lane and see if we
17796 // can form a repeating shuffle mask (local to each sub-lane). At the same
17797 // time, determine the source sub-lane for each destination sub-lane.
17798 int TopSrcSubLane = -1;
17799 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17800 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17801 SubLaneScale,
17802 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17803
17804 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17805 // Extract the sub-lane mask, check that it all comes from the same lane
17806 // and normalize the mask entries to come from the first lane.
17807 int SrcLane = -1;
17808 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17809 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17810 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17811 if (M < 0)
17812 continue;
17813 int Lane = (M % NumElts) / NumLaneElts;
17814 if ((0 <= SrcLane) && (SrcLane != Lane))
17815 return SDValue();
17816 SrcLane = Lane;
17817 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17818 SubLaneMask[Elt] = LocalM;
17819 }
17820
17821 // Whole sub-lane is UNDEF.
17822 if (SrcLane < 0)
17823 continue;
17824
17825 // Attempt to match against the candidate repeated sub-lane masks.
17826 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17827 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17828 for (int i = 0; i != NumSubLaneElts; ++i) {
17829 if (M1[i] < 0 || M2[i] < 0)
17830 continue;
17831 if (M1[i] != M2[i])
17832 return false;
17833 }
17834 return true;
17835 };
17836
17837 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17838 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17839 continue;
17840
17841 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17842 for (int i = 0; i != NumSubLaneElts; ++i) {
17843 int M = SubLaneMask[i];
17844 if (M < 0)
17845 continue;
17846           assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17847                  "Unexpected mask element");
17848 RepeatedSubLaneMask[i] = M;
17849 }
17850
17851 // Track the top most source sub-lane - by setting the remaining to
17852 // UNDEF we can greatly simplify shuffle matching.
17853 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17854 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17855 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17856 break;
17857 }
17858
17859 // Bail if we failed to find a matching repeated sub-lane mask.
17860 if (Dst2SrcSubLanes[DstSubLane] < 0)
17861 return SDValue();
17862 }
17863     assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17864            "Unexpected source lane");
17865
17866 // Create a repeating shuffle mask for the entire vector.
17867 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17868 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17869 int Lane = SubLane / SubLaneScale;
17870 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17871 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17872 int M = RepeatedSubLaneMask[Elt];
17873 if (M < 0)
17874 continue;
17875 int Idx = (SubLane * NumSubLaneElts) + Elt;
17876 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17877 }
17878 }
17879
17880 // Shuffle each source sub-lane to its destination.
17881 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17882 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17883 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17884 if (SrcSubLane < 0)
17885 continue;
17886 for (int j = 0; j != NumSubLaneElts; ++j)
17887 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17888 }
17889
17890 // Avoid returning the same shuffle operation.
17891 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
17892 if (RepeatedMask == Mask || SubLaneMask == Mask)
17893 return SDValue();
17894
17895 SDValue RepeatedShuffle =
17896 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17897
17898 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17899 SubLaneMask);
17900 };
17901
17902 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17903 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
17904 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
17905 // Otherwise we can only permute whole 128-bit lanes.
17906 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
17907 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
17908 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
17909 MinSubLaneScale = 2;
17910 MaxSubLaneScale =
17911 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
17912 }
17913 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17914 MinSubLaneScale = MaxSubLaneScale = 4;
17915
17916 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
17917 if (SDValue Shuffle = ShuffleSubLanes(Scale))
17918 return Shuffle;
17919
17920 return SDValue();
17921}
17922
17923static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17924 bool &ForceV1Zero, bool &ForceV2Zero,
17925 unsigned &ShuffleImm, ArrayRef<int> Mask,
17926 const APInt &Zeroable) {
17927 int NumElts = VT.getVectorNumElements();
17928   assert(VT.getScalarSizeInBits() == 64 &&
17929          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17930          "Unexpected data type for VSHUFPD");
17931   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17932          "Illegal shuffle mask");
17933
17934 bool ZeroLane[2] = { true, true };
17935 for (int i = 0; i < NumElts; ++i)
17936 ZeroLane[i & 1] &= Zeroable[i];
17937
17938 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
17939   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
17940 ShuffleImm = 0;
17941 bool ShufpdMask = true;
17942 bool CommutableMask = true;
17943 for (int i = 0; i < NumElts; ++i) {
17944 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17945 continue;
17946 if (Mask[i] < 0)
17947 return false;
17948 int Val = (i & 6) + NumElts * (i & 1);
17949 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17950 if (Mask[i] < Val || Mask[i] > Val + 1)
17951 ShufpdMask = false;
17952 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17953 CommutableMask = false;
17954 ShuffleImm |= (Mask[i] % 2) << i;
17955 }
17956
17957 if (!ShufpdMask && !CommutableMask)
17958 return false;
17959
17960 if (!ShufpdMask && CommutableMask)
17961 std::swap(V1, V2);
17962
17963 ForceV1Zero = ZeroLane[0];
17964 ForceV2Zero = ZeroLane[1];
17965 return true;
17966}
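
A worked example of the SHUFPD immediate computation above, using a hypothetical v4f64 mask with nothing zeroable:

    // Illustrative only: mask {1, 5, 2, 7} fits the non-commuted SHUFPD form.
    #include <cstdio>

    int main() {
      const int NumElts = 4;
      int Mask[4] = {1, 5, 2, 7};
      unsigned ShuffleImm = 0;
      bool ShufpdMask = true;
      for (int i = 0; i < NumElts; ++i) {
        int Val = (i & 6) + NumElts * (i & 1); // allowed source pair for slot i
        if (Mask[i] < Val || Mask[i] > Val + 1)
          ShufpdMask = false;
        ShuffleImm |= (Mask[i] % 2) << i;
      }
      std::printf("match=%d imm=0x%x\n", ShufpdMask, ShuffleImm); // match=1 imm=0xb
    }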
17967
17968static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17969 SDValue V2, ArrayRef<int> Mask,
17970 const APInt &Zeroable,
17971 const X86Subtarget &Subtarget,
17972 SelectionDAG &DAG) {
17973 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17974        "Unexpected data type for VSHUFPD");
17975
17976 unsigned Immediate = 0;
17977 bool ForceV1Zero = false, ForceV2Zero = false;
17978 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17979 Mask, Zeroable))
17980 return SDValue();
17981
17982 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17983 if (ForceV1Zero)
17984 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17985 if (ForceV2Zero)
17986 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17987
17988 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17989 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17990}
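To see how the SHUFPD immediate above is formed, here is a minimal standalone sketch (plain ints rather than LLVM's SDValue/ArrayRef types, with the zeroable and commuted cases omitted; the helper name encodeShufpdImm is made up for illustration) that reproduces the ShuffleImm |= (Mask[i] % 2) << i encoding from matchShuffleWithSHUFPD for a v4f64-style mask:

#include <array>
#include <cstdio>

// Sketch only: encode a 4 x f64 shuffle mask as a VSHUFPD immediate, the way
// matchShuffleWithSHUFPD does for masks that already fit the SHUFPD pattern.
// Result element i may only pick Val or Val+1, where
// Val = (i & 6) + NumElts * (i & 1); bit i of the immediate is Mask[i] % 2.
static bool encodeShufpdImm(const std::array<int, 4> &Mask, unsigned &Imm) {
  const int NumElts = 4;
  Imm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      return false; // not a plain SHUFPD pattern (commuted/zeroable cases omitted)
    Imm |= (Mask[i] % 2) << i;
  }
  return true;
}

int main() {
  unsigned Imm = 0;
  // Mask {0, 5, 2, 7} selects {V1[0], V2[1], V1[2], V2[3]}.
  if (encodeShufpdImm({0, 5, 2, 7}, Imm))
    std::printf("imm = 0x%x\n", Imm); // prints imm = 0xa, i.e. 0b1010
}

For the mask {0, 5, 2, 7} this yields 0b1010: odd result elements come from V2 and take the high element of their 128-bit lane, matching what the SHUFP node above encodes.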
17991
17992// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17993// by zeroable elements in the remaining 24 elements. Turn this into two
17994// vmovqb instructions shuffled together.
17995static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17996 SDValue V1, SDValue V2,
17997 ArrayRef<int> Mask,
17998 const APInt &Zeroable,
17999 SelectionDAG &DAG) {
18000 assert(VT == MVT::v32i8 && "Unexpected type!");
18001
18002 // The first 8 indices should be every 8th element.
18003 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18004 return SDValue();
18005
18006 // Remaining elements need to be zeroable.
18007 if (Zeroable.countl_one() < (Mask.size() - 8))
18008 return SDValue();
18009
18010 V1 = DAG.getBitcast(MVT::v4i64, V1);
18011 V2 = DAG.getBitcast(MVT::v4i64, V2);
18012
18013 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18014 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18015
18016 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18017 // the upper bits of the result using an unpckldq.
18018 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18019 { 0, 1, 2, 3, 16, 17, 18, 19,
18020 4, 5, 6, 7, 20, 21, 22, 23 });
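  // At this point, for the matched mask {0, 8, 16, 24, 32, 40, 48, 56, <24 x zero>},
  // each truncated vector holds four meaningful low bytes (V1: input bytes 0/8/16/24,
  // V2: bytes 32/40/48/56 of the concatenated input) followed by twelve zero bytes,
  // so the unpckldq above leaves the eight wanted bytes in the low half of the v16i8
  // and zeros everywhere else.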
18021 // Insert the unpckldq into a zero vector to widen to v32i8.
18022 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18023 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18024 DAG.getIntPtrConstant(0, DL));
18025}
18026
18027// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18028// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18029// =>
18030// ul = unpckl v1, v2
18031// uh = unpckh v1, v2
18032// a = vperm ul, uh
18033// b = vperm ul, uh
18034//
18035// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18036// and permute. We cannot directly match v3 because it is split into two
18037// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18038// pair of 256-bit shuffles and makes sure the masks are consecutive.
18039//
18040// Once unpck and permute nodes are created, the permute corresponding to this
18041// shuffle is returned, while the other permute replaces the other half of the
18042// shuffle in the selection dag.
18043static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18044 SDValue V1, SDValue V2,
18045 ArrayRef<int> Mask,
18046 SelectionDAG &DAG) {
18047 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18048 VT != MVT::v32i8)
18049 return SDValue();
18050 // <B0, B1, B0+1, B1+1, ..., >
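  // e.g. for v8i32 with Begin0 = 0 and Begin1 = 8 this matches {0, 8, 1, 9, 2, 10, 3, 11}.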
18051 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18052 unsigned Begin1) {
18053 size_t Size = Mask.size();
18054   assert(Size % 2 == 0 && "Expected even mask size");
18055 for (unsigned I = 0; I < Size; I += 2) {
18056 if (Mask[I] != (int)(Begin0 + I / 2) ||
18057 Mask[I + 1] != (int)(Begin1 + I / 2))
18058 return false;
18059 }
18060 return true;
18061 };
18062 // Check which half of the shuffle pair this node is.
18063 int NumElts = VT.getVectorNumElements();
18064 size_t FirstQtr = NumElts / 2;
18065 size_t ThirdQtr = NumElts + NumElts / 2;
18066 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18067 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18068 if (!IsFirstHalf && !IsSecondHalf)
18069 return SDValue();
18070
18071 // Find the intersection between shuffle users of V1 and V2.
18072 SmallVector<SDNode *, 2> Shuffles;
18073 for (SDNode *User : V1->uses())
18074 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18075 User->getOperand(1) == V2)
18076 Shuffles.push_back(User);
18077 // Limit user size to two for now.
18078 if (Shuffles.size() != 2)
18079 return SDValue();
18080 // Find out which half of the 512-bit shuffles is each smaller shuffle
18081 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18082 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18083 SDNode *FirstHalf;
18084 SDNode *SecondHalf;
18085 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18086 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18087 FirstHalf = Shuffles[0];
18088 SecondHalf = Shuffles[1];
18089 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18090 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18091 FirstHalf = Shuffles[1];
18092 SecondHalf = Shuffles[0];
18093 } else {
18094 return SDValue();
18095 }
18096 // Lower into unpck and perm. Return the perm of this shuffle and replace
18097 // the other.
18098 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18099 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18100 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18101 DAG.getTargetConstant(0x20, DL, MVT::i8));
18102 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18103 DAG.getTargetConstant(0x31, DL, MVT::i8));
18104 if (IsFirstHalf) {
18105 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18106 return Perm1;
18107 }
18108 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18109 return Perm2;
18110}
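To make the unpck-plus-permute rewrite above concrete, here is a small self-contained simulation (plain integer arrays standing in for v8i32 vectors; the helpers unpck and perm2x128 are illustrative stand-ins, not LLVM APIs, and the zeroing bits of VPERM2X128 are ignored) showing that UNPCKL/UNPCKH followed by VPERM2X128 with immediates 0x20 and 0x31 reproduces the two interleaving half-shuffles:

#include <array>
#include <cassert>
#include <cstdio>

using V8 = std::array<int, 8>; // one 256-bit vector of 8 x i32

// Per-128-bit-lane unpack, as UNPCKL/UNPCKH do for v8i32.
static V8 unpck(const V8 &A, const V8 &B, bool High) {
  V8 R{};
  for (int Lane = 0; Lane < 2; ++Lane) {
    int Base = Lane * 4 + (High ? 2 : 0);
    R[Lane * 4 + 0] = A[Base + 0];
    R[Lane * 4 + 1] = B[Base + 0];
    R[Lane * 4 + 2] = A[Base + 1];
    R[Lane * 4 + 3] = B[Base + 1];
  }
  return R;
}

// VPERM2X128 selection only: imm 0x20 = low lane of Op0 | low lane of Op1,
//                            imm 0x31 = high lane of Op0 | high lane of Op1.
static V8 perm2x128(const V8 &Op0, const V8 &Op1, unsigned Imm) {
  V8 R{};
  for (int Half = 0; Half < 2; ++Half) {
    unsigned Sel = (Imm >> (Half * 4)) & 0x3;
    const V8 &Src = (Sel & 2) ? Op1 : Op0;
    for (int i = 0; i < 4; ++i)
      R[Half * 4 + i] = Src[(Sel & 1) * 4 + i];
  }
  return R;
}

int main() {
  V8 A{0, 1, 2, 3, 4, 5, 6, 7}, B{8, 9, 10, 11, 12, 13, 14, 15};
  V8 Lo = unpck(A, B, false), Hi = unpck(A, B, true);
  V8 FirstHalf = perm2x128(Lo, Hi, 0x20);  // expect {0,8,1,9,2,10,3,11}
  V8 SecondHalf = perm2x128(Lo, Hi, 0x31); // expect {4,12,5,13,6,14,7,15}
  for (int i = 0; i < 8; ++i) {
    assert(FirstHalf[i] == (i % 2 ? 8 + i / 2 : i / 2));
    assert(SecondHalf[i] == (i % 2 ? 12 + i / 2 : 4 + i / 2));
  }
  std::printf("interleave halves reproduced\n");
}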
18111
18112/// Handle lowering of 4-lane 64-bit floating point shuffles.
18113///
18114/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18115/// isn't available.
18116static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18117 const APInt &Zeroable, SDValue V1, SDValue V2,
18118 const X86Subtarget &Subtarget,
18119 SelectionDAG &DAG) {
18120 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18121 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18122 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18123
18124 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18125 Subtarget, DAG))
18126 return V;
18127
18128 if (V2.isUndef()) {
18129 // Check for being able to broadcast a single element.
18130 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18131 Mask, Subtarget, DAG))
18132 return Broadcast;
18133
18134 // Use low duplicate instructions for masks that match their pattern.
18135 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18136 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18137
18138 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18139 // Non-half-crossing single input shuffles can be lowered with an
18140 // interleaved permutation.
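      // For example, Mask {1, 0, 3, 2} (swap within each 128-bit lane) sets a bit
      // whenever the high element of a lane is taken, giving 0b0101 = 0x5.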
18141 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18142 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18143 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18144 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18145 }
18146
18147 // With AVX2 we have direct support for this permutation.
18148 if (Subtarget.hasAVX2())
18149 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18150 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18151
18152 // Try to create an in-lane repeating shuffle mask and then shuffle the
18153 // results into the target lanes.
18154 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18155 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18156 return V;
18157
18158 // Try to permute the lanes and then use a per-lane permute.
18159 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18160 Mask, DAG, Subtarget))
18161 return V;
18162
18163 // Otherwise, fall back.
18164 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18165 DAG, Subtarget);
18166 }
18167
18168 // Use dedicated unpack instructions for masks that match their pattern.
18169 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18170 return V;
18171
18172 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18173 Zeroable, Subtarget, DAG))
18174 return Blend;
18175
18176 // Check if the blend happens to exactly fit that of SHUFPD.
18177 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18178 Zeroable, Subtarget, DAG))
18179 return Op;
18180
18181 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18182 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18183
18184 // If we have lane crossing shuffles AND they don't all come from the lower
18185 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18186 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18187 // canonicalize to a blend of splat which isn't necessary for this combine.
18188 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18189 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18190 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18191 (V2.getOpcode() != ISD::BUILD_VECTOR))
18192 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18193
18194 // If we have one input in place, then we can permute the other input and
18195 // blend the result.
18196 if (V1IsInPlace || V2IsInPlace)
18197 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18198 Subtarget, DAG);
18199
18200 // Try to create an in-lane repeating shuffle mask and then shuffle the
18201 // results into the target lanes.
18202 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18203 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18204 return V;
18205
18206 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18207 // shuffle. However, if we have AVX2 and either input is already in place,
18208 // we will be able to shuffle the other input even across lanes in a single
18209 // instruction, so skip this pattern.
18210 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18211 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18212 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18213 return V;
18214
18215 // If we have VLX support, we can use VEXPAND.
18216 if (Subtarget.hasVLX())
18217 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18218 DAG, Subtarget))
18219 return V;
18220
18221 // If we have AVX2 then we always want to lower with a blend because at v4 we
18222 // can fully permute the elements.
18223 if (Subtarget.hasAVX2())
18224 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18225 Subtarget, DAG);
18226
18227 // Otherwise fall back on generic lowering.
18228 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18229 Subtarget, DAG);
18230}
18231
18232/// Handle lowering of 4-lane 64-bit integer shuffles.
18233///
18234/// This routine is only called when we have AVX2 and thus a reasonable
18235 /// instruction set for v4i64 shuffling.
18236static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18237 const APInt &Zeroable, SDValue V1, SDValue V2,
18238 const X86Subtarget &Subtarget,
18239 SelectionDAG &DAG) {
18240 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18241 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18242 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18243 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18244
18245 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18246 Subtarget, DAG))
18247 return V;
18248
18249 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18250 Zeroable, Subtarget, DAG))
18251 return Blend;
18252
18253 // Check for being able to broadcast a single element.
18254 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18255 Subtarget, DAG))
18256 return Broadcast;
18257
18258 if (V2.isUndef()) {
18259 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18260 // can use lower latency instructions that will operate on both lanes.
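    // For example, a v4i64 mask {1, 0, 3, 2} repeats as {1, 0} in each 128-bit lane;
    // widening each i64 index into a pair of i32 indices yields the PSHUFD mask
    // {2, 3, 0, 1} applied to the v8i32 bitcast below.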
18261 SmallVector<int, 2> RepeatedMask;
18262 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18263 SmallVector<int, 4> PSHUFDMask;
18264 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18265 return DAG.getBitcast(
18266 MVT::v4i64,
18267 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18268 DAG.getBitcast(MVT::v8i32, V1),
18269 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18270 }
18271
18272 // AVX2 provides a direct instruction for permuting a single input across
18273 // lanes.
18274 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18275 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18276 }
18277
18278 // Try to use shift instructions.
18279 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
18280 Zeroable, Subtarget, DAG))
18281 return Shift;
18282
18283 // If we have VLX support, we can use VALIGN or VEXPAND.
18284 if (Subtarget.hasVLX()) {
18285 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18286 Subtarget, DAG))
18287 return Rotate;
18288
18289 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18290 DAG, Subtarget))
18291 return V;
18292 }
18293
18294 // Try to use PALIGNR.
18295 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18296 Subtarget, DAG))
18297 return Rotate;
18298
18299 // Use dedicated unpack instructions for masks that match their pattern.
18300 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18301 return V;
18302
18303 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18304 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18305
18306 // If we have one input in place, then we can permute the other input and
18307 // blend the result.
18308 if (V1IsInPlace || V2IsInPlace)
18309 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18310 Subtarget, DAG);
18311
18312 // Try to create an in-lane repeating shuffle mask and then shuffle the
18313 // results into the target lanes.
18314 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18315 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18316 return V;
18317
18318 // Try to lower to PERMQ(BLENDD(V1,V2)).
18319 if (SDValue V =
18320 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18321 return V;
18322
18323 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18324 // shuffle. However, if we have AVX2 and either input is already in place,
18325 // we will be able to shuffle the other input even across lanes in a single
18326 // instruction, so skip this pattern.
18327 if (!V1IsInPlace && !V2IsInPlace)
18328 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18329 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18330 return Result;
18331
18332 // Otherwise fall back on generic blend lowering.
18333 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18334 Subtarget, DAG);
18335}
18336
18337/// Handle lowering of 8-lane 32-bit floating point shuffles.
18338///
18339/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18340/// isn't available.
18341static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18342 const APInt &Zeroable, SDValue V1, SDValue V2,
18343 const X86Subtarget &Subtarget,
18344 SelectionDAG &DAG) {
18345 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
1. '?' condition is true
18346 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
2. '?' condition is true
18347 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
3. Assuming the condition is true
4. '?' condition is true
18348
18349 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
5. Taking false branch
18350 Zeroable, Subtarget, DAG))
18351 return Blend;
18352
18353 // Check for being able to broadcast a single element.
18354 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
6. Taking false branch
18355 Subtarget, DAG))
18356 return Broadcast;
18357
18358 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18359 // options to efficiently lower the shuffle.
18360 SmallVector<int, 4> RepeatedMask;
18361 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
7. Assuming the condition is false
8. Taking false branch
18362   assert(RepeatedMask.size() == 4 &&
18363          "Repeated masks must be half the mask width!");
18364
18365 // Use even/odd duplicate instructions for masks that match their pattern.
18366 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18367 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18368 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18369 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
18370
18371 if (V2.isUndef())
18372 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18373 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18374
18375 // Use dedicated unpack instructions for masks that match their pattern.
18376 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18377 return V;
18378
18379 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18380 // have already handled any direct blends.
18381 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18382 }
18383
18384 // Try to create an in-lane repeating shuffle mask and then shuffle the
18385 // results into the target lanes.
18386 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
9. Taking false branch
18387 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18388 return V;
18389
18390 // If we have a single input shuffle with different shuffle patterns in the
18391 // two 128-bit lanes use the variable mask to VPERMILPS.
18392 if (V2.isUndef()) {
10. Taking true branch
18393 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
11. Assuming the condition is false
12. Taking false branch
18394 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18395 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18396 }
18397 if (Subtarget.hasAVX2()) {
13. Taking false branch
18398 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18399 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18400 }
18401 // Otherwise, fall back.
18402 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
14. Calling 'lowerShuffleAsLanePermuteAndShuffle'
18403 DAG, Subtarget);
18404 }
18405
18406 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18407 // shuffle.
18408 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18409 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18410 return Result;
18411
18412 // If we have VLX support, we can use VEXPAND.
18413 if (Subtarget.hasVLX())
18414 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18415 DAG, Subtarget))
18416 return V;
18417
18418 // Try to match an interleave of two v8f32s and lower them as unpck and
18419 // permutes using ymms. This needs to go before we try to split the vectors.
18420 //
18421 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18422 // this path inadvertently.
18423 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18424 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18425 Mask, DAG))
18426 return V;
18427
18428 // For non-AVX512, if the mask is of 16-bit elements within each lane, try to
18429 // split, since after the split we get more efficient code using vpunpcklwd
18430 // and vpunpckhwd than with vblend.
18431 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18432 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18433 DAG);
18434
18435 // If we have AVX2 then we always want to lower with a blend because at v8 we
18436 // can fully permute the elements.
18437 if (Subtarget.hasAVX2())
18438 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18439 Subtarget, DAG);
18440
18441 // Otherwise fall back on generic lowering.
18442 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18443 Subtarget, DAG);
18444}
18445
18446/// Handle lowering of 8-lane 32-bit integer shuffles.
18447///
18448/// This routine is only called when we have AVX2 and thus a reasonable
18449 /// instruction set for v8i32 shuffling.
18450static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18451 const APInt &Zeroable, SDValue V1, SDValue V2,
18452 const X86Subtarget &Subtarget,
18453 SelectionDAG &DAG) {
18454 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18455 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18456 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18457 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18458
18459 // Whenever we can lower this as a zext, that instruction is strictly faster
18460 // than any alternative. It also allows us to fold memory operands into the
18461 // shuffle in many cases.
18462 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18463 Zeroable, Subtarget, DAG))
18464 return ZExt;
18465
18466 // Try to match an interleave of two v8i32s and lower them as unpck and
18467 // permutes using ymms. This needs to go before we try to split the vectors.
18468 if (!Subtarget.hasAVX512())
18469 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18470 Mask, DAG))
18471 return V;
18472
18473 // For non-AVX512, if the mask is of 16-bit elements within each lane, try to
18474 // split, since after the split we get more efficient code than vblend by
18475 // using vpunpcklwd and vpunpckhwd.
18476 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18477 !Subtarget.hasAVX512())
18478 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18479 DAG);
18480
18481 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18482 Zeroable, Subtarget, DAG))
18483 return Blend;
18484
18485 // Check for being able to broadcast a single element.
18486 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18487 Subtarget, DAG))
18488 return Broadcast;
18489
18490 // If the shuffle mask is repeated in each 128-bit lane we can use more
18491 // efficient instructions that mirror the shuffles across the two 128-bit
18492 // lanes.
18493 SmallVector<int, 4> RepeatedMask;
18494 bool Is128BitLaneRepeatedShuffle =
18495 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18496 if (Is128BitLaneRepeatedShuffle) {
18497   assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18498 if (V2.isUndef())
18499 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18500 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18501
18502 // Use dedicated unpack instructions for masks that match their pattern.
18503 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18504 return V;
18505 }
18506
18507 // Try to use shift instructions.
18508 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
18509 Zeroable, Subtarget, DAG))
18510 return Shift;
18511
18512 // If we have VLX support, we can use VALIGN or EXPAND.
18513 if (Subtarget.hasVLX()) {
18514 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18515 Subtarget, DAG))
18516 return Rotate;
18517
18518 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18519 DAG, Subtarget))
18520 return V;
18521 }
18522
18523 // Try to use byte rotation instructions.
18524 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18525 Subtarget, DAG))
18526 return Rotate;
18527
18528 // Try to create an in-lane repeating shuffle mask and then shuffle the
18529 // results into the target lanes.
18530 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18531 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18532 return V;
18533
18534 if (V2.isUndef()) {
18535 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18536 // because that should be faster than the variable permute alternatives.
18537 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18538 return V;
18539
18540 // If the shuffle patterns aren't repeated but it's a single input, directly
18541 // generate a cross-lane VPERMD instruction.
18542 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18543 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18544 }
18545
18546 // Assume that a single SHUFPS is faster than an alternative sequence of
18547 // multiple instructions (even if the CPU has a domain penalty).
18548 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18549 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18550 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18551 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18552 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18553 CastV1, CastV2, DAG);
18554 return DAG.getBitcast(MVT::v8i32, ShufPS);
18555 }
18556
18557 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18558 // shuffle.
18559 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18560 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18561 return Result;
18562
18563 // Otherwise fall back on generic blend lowering.
18564 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18565 Subtarget, DAG);
18566}
18567
18568/// Handle lowering of 16-lane 16-bit integer shuffles.
18569///
18570/// This routine is only called when we have AVX2 and thus a reasonable
18571 /// instruction set for v16i16 shuffling.
18572static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18573 const APInt &Zeroable, SDValue V1, SDValue V2,
18574 const X86Subtarget &Subtarget,
18575 SelectionDAG &DAG) {
18576 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18577 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18578 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18579 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18580
18581 // Whenever we can lower this as a zext, that instruction is strictly faster
18582 // than any alternative. It also allows us to fold memory operands into the
18583 // shuffle in many cases.
18584 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18585 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18586 return ZExt;
18587
18588 // Check for being able to broadcast a single element.
18589 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18590 Subtarget, DAG))
18591 return Broadcast;
18592
18593 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18594 Zeroable, Subtarget, DAG))
18595 return Blend;
18596
18597 // Use dedicated unpack instructions for masks that match their pattern.
18598 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18599 return V;
18600
18601 // Use dedicated pack instructions for masks that match their pattern.
18602 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18603 Subtarget))
18604 return V;
18605
18606 // Try to lower using a truncation.
18607 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18608 Subtarget, DAG))
18609 return V;
18610
18611 // Try to use shift instructions.
18612 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
18613 Zeroable, Subtarget, DAG))
18614 return Shift;
18615
18616 // Try to use byte rotation instructions.
18617 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18618 Subtarget, DAG))
18619 return Rotate;
18620
18621 // Try to create an in-lane repeating shuffle mask and then shuffle the
18622 // results into the target lanes.
18623 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18624 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18625 return V;
18626
18627 if (V2.isUndef()) {
18628 // Try to use bit rotation instructions.
18629 if (SDValue Rotate =
18630 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18631 return Rotate;
18632
18633 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18634 // because that should be faster than the variable permute alternatives.
18635 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18636 return V;
18637
18638 // There are no generalized cross-lane shuffle operations available on i16
18639 // element types.
18640 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18641 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18642 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18643 return V;
18644
18645 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18646 DAG, Subtarget);
18647 }
18648
18649 SmallVector<int, 8> RepeatedMask;
18650 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18651 // As this is a single-input shuffle, the repeated mask should be
18652 // a strictly valid v8i16 mask that we can pass through to the v8i16
18653 // lowering to handle even the v16 case.
18654 return lowerV8I16GeneralSingleInputShuffle(
18655 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18656 }
18657 }
18658
18659 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18660 Zeroable, Subtarget, DAG))
18661 return PSHUFB;
18662
18663 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18664 if (Subtarget.hasBWI())
18665 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18666
18667 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18668 // shuffle.
18669 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18670 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18671 return Result;
18672
18673 // Try to permute the lanes and then use a per-lane permute.
18674 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18675 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18676 return V;
18677
18678 // Try to match an interleave of two v16i16s and lower them as unpck and
18679 // permutes using ymms.
18680 if (!Subtarget.hasAVX512())
18681 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18682 Mask, DAG))
18683 return V;
18684
18685 // Otherwise fall back on generic lowering.
18686 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18687 Subtarget, DAG);
18688}
18689
18690/// Handle lowering of 32-lane 8-bit integer shuffles.
18691///
18692/// This routine is only called when we have AVX2 and thus a reasonable
18693 /// instruction set for v32i8 shuffling.
18694static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18695 const APInt &Zeroable, SDValue V1, SDValue V2,
18696 const X86Subtarget &Subtarget,
18697 SelectionDAG &DAG) {
18698 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18699 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18700 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18701 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18702
18703 // Whenever we can lower this as a zext, that instruction is strictly faster
18704 // than any alternative. It also allows us to fold memory operands into the
18705 // shuffle in many cases.
18706 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18707 Zeroable, Subtarget, DAG))
18708 return ZExt;
18709
18710 // Check for being able to broadcast a single element.
18711 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18712 Subtarget, DAG))
18713 return Broadcast;
18714
18715 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18716 Zeroable, Subtarget, DAG))
18717 return Blend;
18718
18719 // Use dedicated unpack instructions for masks that match their pattern.
18720 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18721 return V;
18722
18723 // Use dedicated pack instructions for masks that match their pattern.
18724 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18725 Subtarget))
18726 return V;
18727
18728 // Try to lower using a truncation.
18729 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18730 Subtarget, DAG))
18731 return V;
18732
18733 // Try to use shift instructions.
18734 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
18735 Zeroable, Subtarget, DAG))
18736 return Shift;
18737
18738 // Try to use byte rotation instructions.
18739 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18740 Subtarget, DAG))
18741 return Rotate;
18742
18743 // Try to use bit rotation instructions.
18744 if (V2.isUndef())
18745 if (SDValue Rotate =
18746 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18747 return Rotate;
18748
18749 // Try to create an in-lane repeating shuffle mask and then shuffle the
18750 // results into the target lanes.
18751 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18752 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18753 return V;
18754
18755 // There are no generalized cross-lane shuffle operations available on i8
18756 // element types.
18757 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18758 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18759 // because that should be faster than the variable permute alternatives.
18760 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18761 return V;
18762
18763 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18764 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18765 return V;
18766
18767 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18768 DAG, Subtarget);
18769 }
18770
18771 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18772 Zeroable, Subtarget, DAG))
18773 return PSHUFB;
18774
18775 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18776 if (Subtarget.hasVBMI())
18777 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18778
18779 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18780 // shuffle.
18781 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18782 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18783 return Result;
18784
18785 // Try to permute the lanes and then use a per-lane permute.
18786 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18787 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18788 return V;
18789
18790 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18791 // by zeroable elements in the remaining 24 elements. Turn this into two
18792 // vmovqb instructions shuffled together.
18793 if (Subtarget.hasVLX())
18794 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18795 Mask, Zeroable, DAG))
18796 return V;
18797
18798 // Try to match an interleave of two v32i8s and lower them as unpck and
18799 // permutes using ymms.
18800 if (!Subtarget.hasAVX512())
18801 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18802 Mask, DAG))
18803 return V;
18804
18805 // Otherwise fall back on generic lowering.
18806 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18807 Subtarget, DAG);
18808}
18809
18810/// High-level routine to lower various 256-bit x86 vector shuffles.
18811///
18812/// This routine either breaks down the specific type of a 256-bit x86 vector
18813/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18814/// together based on the available instructions.
18815static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18816 SDValue V1, SDValue V2, const APInt &Zeroable,
18817 const X86Subtarget &Subtarget,
18818 SelectionDAG &DAG) {
18819 // If we have a single input to the zero element, insert that into V1 if we
18820 // can do so cheaply.
18821 int NumElts = VT.getVectorNumElements();
18822 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18823
18824 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18825 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18826 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18827 return Insertion;
18828
18829 // Handle special cases where the lower or upper half is UNDEF.
18830 if (SDValue V =
18831 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18832 return V;
18833
18834 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18835 // can check for those subtargets here and avoid much of the subtarget
18836 // querying in the per-vector-type lowering routines. With AVX1 we have
18837 // essentially *zero* ability to manipulate a 256-bit vector with integer
18838 // types. Since we'll use floating point types there eventually, just
18839 // immediately cast everything to a float and operate entirely in that domain.
18840 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18841 int ElementBits = VT.getScalarSizeInBits();
18842 if (ElementBits < 32) {
18843 // No floating point type available, if we can't use the bit operations
18844 // for masking/blending then decompose into 128-bit vectors.
18845 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18846 Subtarget, DAG))
18847 return V;
18848 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18849 return V;
18850 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18851 }
18852
18853 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18854 VT.getVectorNumElements());
18855 V1 = DAG.getBitcast(FpVT, V1);
18856 V2 = DAG.getBitcast(FpVT, V2);
18857 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18858 }
18859
18860 if (VT == MVT::v16f16) {
18861 V1 = DAG.getBitcast(MVT::v16i16, V1);
18862 V2 = DAG.getBitcast(MVT::v16i16, V2);
18863 return DAG.getBitcast(MVT::v16f16,
18864 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18865 }
18866
18867 switch (VT.SimpleTy) {
18868 case MVT::v4f64:
18869 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18870 case MVT::v4i64:
18871 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18872 case MVT::v8f32:
18873 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18874 case MVT::v8i32:
18875 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18876 case MVT::v16i16:
18877 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18878 case MVT::v32i8:
18879 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18880
18881 default:
18882   llvm_unreachable("Not a valid 256-bit x86 vector type!");
18883 }
18884}
18885
18886/// Try to lower a vector shuffle as a 128-bit shuffles.
18887static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18888 const APInt &Zeroable, SDValue V1, SDValue V2,
18889 const X86Subtarget &Subtarget,
18890 SelectionDAG &DAG) {
18891 assert(VT.getScalarSizeInBits() == 64 &&
18892        "Unexpected element type size for 128bit shuffle.");
18893
18894 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
18895 // is most probably the better solution for that case.
18896 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
18897
18898 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
18899 SmallVector<int, 4> Widened128Mask;
18900 if (!canWidenShuffleElements(Mask, Widened128Mask))
18901 return SDValue();
18902 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
18903
18904 // Try to use an insert into a zero vector.
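  // For example, if the upper four elements of a v8f64 shuffle are zeroable and the
  // low 128 or 256 bits are simply V1's low bits, this folds to an INSERT_SUBVECTOR
  // of those low bits into an all-zero vector.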
18905 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
18906 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
18907 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
18908 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
18909 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
18910 DAG.getIntPtrConstant(0, DL));
18911 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18912 getZeroVector(VT, Subtarget, DAG, DL), LoV,
18913 DAG.getIntPtrConstant(0, DL));
18914 }
18915
18916 // Check for patterns which can be matched with a single insert of a 256-bit
18917 // subvector.
18918 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
18919 if (OnlyUsesV1 ||
18920 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
18921 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
18922 SDValue SubVec =
18923 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
18924 DAG.getIntPtrConstant(0, DL));
18925 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
18926 DAG.getIntPtrConstant(4, DL));
18927 }
18928
18929 // See if this is an insertion of the lower 128-bits of V2 into V1.
18930 bool IsInsert = true;
18931 int V2Index = -1;
18932 for (int i = 0; i < 4; ++i) {
18933   assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18934 if (Widened128Mask[i] < 0)
18935 continue;
18936
18937 // Make sure all V1 subvectors are in place.
18938 if (Widened128Mask[i] < 4) {
18939 if (Widened128Mask[i] != i) {
18940 IsInsert = false;
18941 break;
18942 }
18943 } else {
18944 // Make sure we only have a single V2 index and it's the lowest 128 bits.
18945 if (V2Index >= 0 || Widened128Mask[i] != 4) {
18946 IsInsert = false;
18947 break;
18948 }
18949 V2Index = i;
18950 }
18951 }
18952 if (IsInsert && V2Index >= 0) {
18953 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
18954 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
18955 DAG.getIntPtrConstant(0, DL));
18956 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
18957 }
18958
18959 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
18960 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
18961 // possible we at least ensure the lanes stay sequential to help later
18962 // combines.
18963 SmallVector<int, 2> Widened256Mask;
18964 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
18965 Widened128Mask.clear();
18966 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
18967 }
18968
18969 // Try to lower to vshuf64x2/vshuf32x4.
18970 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
18971 unsigned PermMask = 0;
18972 // Ensure elements came from the same Op.
18973 for (int i = 0; i < 4; ++i) {
18974   assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18975 if (Widened128Mask[i] < 0)
18976 continue;
18977
18978 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
18979 unsigned OpIndex = i / 2;
18980 if (Ops[OpIndex].isUndef())
18981 Ops[OpIndex] = Op;
18982 else if (Ops[OpIndex] != Op)
18983 return SDValue();
18984
18985 // Convert the 128-bit shuffle mask selection values into 128-bit selection
18986 // bits defined by a vshuf64x2 instruction's immediate control byte.
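    // For example, a widened v8f64 mask {0, 1, 4, 5} takes Ops[0] = V1 for the two
    // low 128-bit chunks and Ops[1] = V2 for the two high chunks, giving
    // PermMask = (0 << 0) | (1 << 2) | (0 << 4) | (1 << 6) = 0x44.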
18987 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
18988 }
18989
18990 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
18991 DAG.getTargetConstant(PermMask, DL, MVT::i8));
18992}
18993
18994/// Handle lowering of 8-lane 64-bit floating point shuffles.
18995static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18996 const APInt &Zeroable, SDValue V1, SDValue V2,
18997 const X86Subtarget &Subtarget,
18998 SelectionDAG &DAG) {
18999 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19000 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19001 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19002
19003 if (V2.isUndef()) {
19004 // Use low duplicate instructions for masks that match their pattern.
19005 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19006 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19007
19008 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19009 // Non-half-crossing single input shuffles can be lowered with an
19010 // interleaved permutation.
19011 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19012 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19013 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19014 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19015 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19016 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19017 }
19018
19019 SmallVector<int, 4> RepeatedMask;
19020 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19021 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19022 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19023 }
19024
19025 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19026 V2, Subtarget, DAG))
19027 return Shuf128;
19028
19029 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19030 return Unpck;
19031
19032 // Check if the blend happens to exactly fit that of SHUFPD.
19033 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19034 Zeroable, Subtarget, DAG))
19035 return Op;
19036
19037 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19038 DAG, Subtarget))
19039 return V;
19040
19041 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19042 Zeroable, Subtarget, DAG))
19043 return Blend;
19044
19045 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19046}
19047
19048/// Handle lowering of 16-lane 32-bit floating point shuffles.
19049static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19050 const APInt &Zeroable, SDValue V1, SDValue V2,
19051 const X86Subtarget &Subtarget,
19052 SelectionDAG &DAG) {
19053 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19054 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19055 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19056
19057 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19058 // options to efficiently lower the shuffle.
19059 SmallVector<int, 4> RepeatedMask;
19060 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19061 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19062
19063 // Use even/odd duplicate instructions for masks that match their pattern.
19064 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19065 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19066 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19067 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19068
19069 if (V2.isUndef())
19070 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19071 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19072
19073 // Use dedicated unpack instructions for masks that match their pattern.
19074 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19075 return V;
19076
19077 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19078 Zeroable, Subtarget, DAG))
19079 return Blend;
19080
19081 // Otherwise, fall back to a SHUFPS sequence.
19082 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19083 }
19084
19085 // Try to create an in-lane repeating shuffle mask and then shuffle the
19086 // results into the target lanes.
19087 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19088 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19089 return V;
19090
19091 // If we have a single-input shuffle with different shuffle patterns in the
19092 // 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
19093 if (V2.isUndef() &&
19094 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19095 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19096 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19097 }
19098
19099 // If we have AVX512F support, we can use VEXPAND.
19100 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19101 V1, V2, DAG, Subtarget))
19102 return V;
19103
19104 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19105}
19106
19107/// Handle lowering of 8-lane 64-bit integer shuffles.
19108static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19109 const APInt &Zeroable, SDValue V1, SDValue V2,
19110 const X86Subtarget &Subtarget,
19111 SelectionDAG &DAG) {
19112 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19113 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19114 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19115
19116 if (V2.isUndef()) {
19117 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19118 // can use lower latency instructions that will operate on all four
19119 // 128-bit lanes.
19120 SmallVector<int, 2> Repeated128Mask;
19121 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19122 SmallVector<int, 4> PSHUFDMask;
19123 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19124 return DAG.getBitcast(
19125 MVT::v8i64,
19126 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19127 DAG.getBitcast(MVT::v16i32, V1),
19128 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19129 }
19130
19131 SmallVector<int, 4> Repeated256Mask;
19132 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19133 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19134 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19135 }
19136
19137 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19138 V2, Subtarget, DAG))
19139 return Shuf128;
19140
19141 // Try to use shift instructions.
19142 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
19143 Zeroable, Subtarget, DAG))
19144 return Shift;
19145
19146 // Try to use VALIGN.
19147 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19148 Subtarget, DAG))
19149 return Rotate;
19150
19151 // Try to use PALIGNR.
19152 if (Subtarget.hasBWI())
19153 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19154 Subtarget, DAG))
19155 return Rotate;
19156
19157 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19158 return Unpck;
19159
19160 // If we have AVX512F support, we can use VEXPAND.
19161 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19162 DAG, Subtarget))
19163 return V;
19164
19165 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19166 Zeroable, Subtarget, DAG))
19167 return Blend;
19168
19169 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19170}
19171
19172/// Handle lowering of 16-lane 32-bit integer shuffles.
19173static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19174 const APInt &Zeroable, SDValue V1, SDValue V2,
19175 const X86Subtarget &Subtarget,
19176 SelectionDAG &DAG) {
19177 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19178 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19179 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19180
19181 // Whenever we can lower this as a zext, that instruction is strictly faster
19182 // than any alternative. It also allows us to fold memory operands into the
19183 // shuffle in many cases.
19184 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19185 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19186 return ZExt;
19187
19188 // If the shuffle mask is repeated in each 128-bit lane we can use more
19189 // efficient instructions that mirror the shuffles across the four 128-bit
19190 // lanes.
19191 SmallVector<int, 4> RepeatedMask;
19192 bool Is128BitLaneRepeatedShuffle =
19193 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19194 if (Is128BitLaneRepeatedShuffle) {
19195 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19196 if (V2.isUndef())
19197 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19198 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19199
19200 // Use dedicated unpack instructions for masks that match their pattern.
19201 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19202 return V;
19203 }
19204
19205 // Try to use shift instructions.
19206 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
19207 Zeroable, Subtarget, DAG))
19208 return Shift;
19209
19210 // Try to use VALIGN.
19211 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19212 Subtarget, DAG))
19213 return Rotate;
19214
19215 // Try to use byte rotation instructions.
19216 if (Subtarget.hasBWI())
19217 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19218 Subtarget, DAG))
19219 return Rotate;
19220
19221 // Assume that a single SHUFPS is faster than using a permv shuffle.
19222 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19223 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19224 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19225 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19226 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19227 CastV1, CastV2, DAG);
19228 return DAG.getBitcast(MVT::v16i32, ShufPS);
19229 }
19230
19231 // Try to create an in-lane repeating shuffle mask and then shuffle the
19232 // results into the target lanes.
19233 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19234 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19235 return V;
19236
19237 // If we have AVX512F support, we can use VEXPAND.
19238 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19239 DAG, Subtarget))
19240 return V;
19241
19242 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19243 Zeroable, Subtarget, DAG))
19244 return Blend;
19245
19246 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19247}
19248
19249/// Handle lowering of 32-lane 16-bit integer shuffles.
19250static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19251 const APInt &Zeroable, SDValue V1, SDValue V2,
19252 const X86Subtarget &Subtarget,
19253 SelectionDAG &DAG) {
19254 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19255 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19256 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19257 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19258
19259 // Whenever we can lower this as a zext, that instruction is strictly faster
19260 // than any alternative. It also allows us to fold memory operands into the
19261 // shuffle in many cases.
19262 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19263 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19264 return ZExt;
19265
19266 // Use dedicated unpack instructions for masks that match their pattern.
19267 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19268 return V;
19269
19270 // Use dedicated pack instructions for masks that match their pattern.
19271 if (SDValue V =
19272 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19273 return V;
19274
19275 // Try to use shift instructions.
19276 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
19277 Zeroable, Subtarget, DAG))
19278 return Shift;
19279
19280 // Try to use byte rotation instructions.
19281 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19282 Subtarget, DAG))
19283 return Rotate;
19284
19285 if (V2.isUndef()) {
19286 // Try to use bit rotation instructions.
19287 if (SDValue Rotate =
19288 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19289 return Rotate;
19290
19291 SmallVector<int, 8> RepeatedMask;
19292 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19293 // As this is a single-input shuffle, the repeated mask should be
19294 // a strictly valid v8i16 mask that we can pass through to the v8i16
19295 // lowering to handle even the v32 case.
19296 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19297 RepeatedMask, Subtarget, DAG);
19298 }
19299 }
19300
19301 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19302 Zeroable, Subtarget, DAG))
19303 return Blend;
19304
19305 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19306 Zeroable, Subtarget, DAG))
19307 return PSHUFB;
19308
19309 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19310}
19311
19312/// Handle lowering of 64-lane 8-bit integer shuffles.
19313static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19314 const APInt &Zeroable, SDValue V1, SDValue V2,
19315 const X86Subtarget &Subtarget,
19316 SelectionDAG &DAG) {
19317 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19318 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19319 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19320 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19321
19322 // Whenever we can lower this as a zext, that instruction is strictly faster
19323 // than any alternative. It also allows us to fold memory operands into the
19324 // shuffle in many cases.
19325 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19326 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19327 return ZExt;
19328
19329 // Use dedicated unpack instructions for masks that match their pattern.
19330 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19331 return V;
19332
19333 // Use dedicated pack instructions for masks that match their pattern.
19334 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19335 Subtarget))
19336 return V;
19337
19338 // Try to use shift instructions.
19339 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
19340 Zeroable, Subtarget, DAG))
19341 return Shift;
19342
19343 // Try to use byte rotation instructions.
19344 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19345 Subtarget, DAG))
19346 return Rotate;
19347
19348 // Try to use bit rotation instructions.
19349 if (V2.isUndef())
19350 if (SDValue Rotate =
19351 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19352 return Rotate;
19353
19354 // Lower as AND if possible.
19355 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19356 Zeroable, Subtarget, DAG))
19357 return Masked;
19358
19359 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19360 Zeroable, Subtarget, DAG))
19361 return PSHUFB;
19362
19363 // Try to create an in-lane repeating shuffle mask and then shuffle the
19364 // results into the target lanes.
19365 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19366 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19367 return V;
19368
19369 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19370 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19371 return Result;
19372
19373 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19374 Zeroable, Subtarget, DAG))
19375 return Blend;
19376
19377 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19378 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19379 // PALIGNR will be cheaper than the second PSHUFB+OR.
19380 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19381 Mask, Subtarget, DAG))
19382 return V;
19383
19384 // If we can't directly blend but can use PSHUFB, that will be better as it
19385 // can both shuffle and set up the inefficient blend.
19386 bool V1InUse, V2InUse;
19387 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19388 DAG, V1InUse, V2InUse);
19389 }
19390
19391 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19392 // shuffle.
19393 if (!V2.isUndef())
19394 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19395 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19396 return Result;
19397
19398 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19399 if (Subtarget.hasVBMI())
19400 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19401
19402 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
19403}
19404
19405/// High-level routine to lower various 512-bit x86 vector shuffles.
19406///
19407/// This routine either breaks down the specific type of a 512-bit x86 vector
19408/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19409/// together based on the available instructions.
19410static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19411 MVT VT, SDValue V1, SDValue V2,
19412 const APInt &Zeroable,
19413 const X86Subtarget &Subtarget,
19414 SelectionDAG &DAG) {
19415 assert(Subtarget.hasAVX512() &&
19416        "Cannot lower 512-bit vectors w/ basic ISA!");
19417
19418 // If we have a single input to the zero element, insert that into V1 if we
19419 // can do so cheaply.
19420 int NumElts = Mask.size();
19421 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19422
19423 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19424 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19425 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19426 return Insertion;
19427
19428 // Handle special cases where the lower or upper half is UNDEF.
19429 if (SDValue V =
19430 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19431 return V;
19432
19433 // Check for being able to broadcast a single element.
19434 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19435 Subtarget, DAG))
19436 return Broadcast;
19437
19438 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19439 // Try using bit ops for masking and blending before falling back to
19440 // splitting.
19441 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19442 Subtarget, DAG))
19443 return V;
19444 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19445 return V;
19446
19447 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
19448 }
19449
19450 if (VT == MVT::v32f16) {
19451 V1 = DAG.getBitcast(MVT::v32i16, V1);
19452 V2 = DAG.getBitcast(MVT::v32i16, V2);
19453 return DAG.getBitcast(MVT::v32f16,
19454 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19455 }
19456
19457 // Dispatch to each element type for lowering. If we don't have support for
19458 // specific element type shuffles at 512 bits, immediately split them and
19459 // lower them. Each lowering routine of a given type is allowed to assume that
19460 // the requisite ISA extensions for that element type are available.
19461 switch (VT.SimpleTy) {
19462 case MVT::v8f64:
19463 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19464 case MVT::v16f32:
19465 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19466 case MVT::v8i64:
19467 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19468 case MVT::v16i32:
19469 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19470 case MVT::v32i16:
19471 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19472 case MVT::v64i8:
19473 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19474
19475 default:
19476 llvm_unreachable("Not a valid 512-bit x86 vector type!");
19477 }
19478}
19479
19480static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19481 MVT VT, SDValue V1, SDValue V2,
19482 const X86Subtarget &Subtarget,
19483 SelectionDAG &DAG) {
19484 // Shuffle should be unary.
19485 if (!V2.isUndef())
19486 return SDValue();
19487
19488 int ShiftAmt = -1;
19489 int NumElts = Mask.size();
19490 for (int i = 0; i != NumElts; ++i) {
19491 int M = Mask[i];
19492 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19493        "Unexpected mask index.");
19494 if (M < 0)
19495 continue;
19496
19497 // The first non-undef element determines our shift amount.
19498 if (ShiftAmt < 0) {
19499 ShiftAmt = M - i;
19500 // Need to be shifting right.
19501 if (ShiftAmt <= 0)
19502 return SDValue();
19503 }
19504 // All non-undef elements must shift by the same amount.
19505 if (ShiftAmt != M - i)
19506 return SDValue();
19507 }
19508 assert(ShiftAmt >= 0 && "All undef?");
19509
19510 // Great, we found a shift right.
19511 MVT WideVT = VT;
19512 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19513 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19514 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19515 DAG.getUNDEF(WideVT), V1,
19516 DAG.getIntPtrConstant(0, DL));
19517 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19518 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19519 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19520 DAG.getIntPtrConstant(0, DL));
19521}
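// For illustration: a v4i1 shuffle with mask {1, 2, 3, -1} gives ShiftAmt = 1;
// V1 is inserted into a wider k-register (v8i1 with DQI, otherwise v16i1),
// shifted right by one with KSHIFTR, and the low v4i1 is extracted. The bits
// shifted in from above only land in positions that were undef in the mask.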
19522
19523// Determine if this shuffle can be implemented with a KSHIFT instruction.
19524// Returns the shift amount if possible or -1 if not. This is a simplified
19525// version of matchShuffleAsShift.
19526static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19527 int MaskOffset, const APInt &Zeroable) {
19528 int Size = Mask.size();
19529
19530 auto CheckZeros = [&](int Shift, bool Left) {
19531 for (int j = 0; j < Shift; ++j)
19532 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19533 return false;
19534
19535 return true;
19536 };
19537
19538 auto MatchShift = [&](int Shift, bool Left) {
19539 unsigned Pos = Left ? Shift : 0;
19540 unsigned Low = Left ? 0 : Shift;
19541 unsigned Len = Size - Shift;
19542 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19543 };
19544
19545 for (int Shift = 1; Shift != Size; ++Shift)
19546 for (bool Left : {true, false})
19547 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19548 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19549 return Shift;
19550 }
19551
19552 return -1;
19553}
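// For illustration: with MaskOffset = 0, Mask = {2, 3, 4, 5, 6, 7, -1, -1} and
// the top two elements zeroable, Shift = 2 with Left = false passes both
// CheckZeros and MatchShift, so this reports a KSHIFTR by 2.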
19554
19555
19556// Lower vXi1 vector shuffles.
19557// There is no dedicated instruction on AVX-512 that shuffles the masks.
19558// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19559// vector, shuffle, and then truncate it back.
19560static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19561 MVT VT, SDValue V1, SDValue V2,
19562 const APInt &Zeroable,
19563 const X86Subtarget &Subtarget,
19564 SelectionDAG &DAG) {
19565 assert(Subtarget.hasAVX512() &&
19566        "Cannot lower 512-bit vectors w/o basic ISA!");
19567
19568 int NumElts = Mask.size();
19569
19570 // Try to recognize shuffles that are just padding a subvector with zeros.
19571 int SubvecElts = 0;
19572 int Src = -1;
19573 for (int i = 0; i != NumElts; ++i) {
19574 if (Mask[i] >= 0) {
19575 // Grab the source from the first valid mask. All subsequent elements need
19576 // to use this same source.
19577 if (Src < 0)
19578 Src = Mask[i] / NumElts;
19579 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19580 break;
19581 }
19582
19583 ++SubvecElts;
19584 }
19585 assert(SubvecElts != NumElts && "Identity shuffle?");
19586
19587 // Clip to a power of 2.
19588 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19589
19590 // Make sure the number of zeroable bits in the top at least covers the bits
19591 // not covered by the subvector.
19592 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19593 assert(Src >= 0 && "Expected a source!");
19594 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19595 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19596 Src == 0 ? V1 : V2,
19597 DAG.getIntPtrConstant(0, DL));
19598 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19599 DAG.getConstant(0, DL, VT),
19600 Extract, DAG.getIntPtrConstant(0, DL));
19601 }
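// For illustration: a v8i1 shuffle with mask {0, 1, 2, 3, 8, 8, 8, 8} and an
// all-zero V2 has SubvecElts = 4 and four zeroable high elements, so it is
// lowered as "extract the low v4i1 of V1 and insert it into a zero vector".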
19602
19603 // Try a simple shift right with undef elements. Later we'll try with zeros.
19604 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19605 DAG))
19606 return Shift;
19607
19608 // Try to match KSHIFTs.
19609 unsigned Offset = 0;
19610 for (SDValue V : { V1, V2 }) {
19611 unsigned Opcode;
19612 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19613 if (ShiftAmt >= 0) {
19614 MVT WideVT = VT;
19615 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19616 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19617 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19618 DAG.getUNDEF(WideVT), V,
19619 DAG.getIntPtrConstant(0, DL));
19620 // Widened right shifts need two shifts to ensure we shift in zeroes.
19621 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19622 int WideElts = WideVT.getVectorNumElements();
19623 // Shift left to put the original vector in the MSBs of the new size.
19624 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19625 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19626 // Increase the shift amount to account for the left shift.
19627 ShiftAmt += WideElts - NumElts;
19628 }
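// For illustration: a v4i1 KSHIFTR by 1 widened to v16i1 first shifts left by
// 12 to park the four original bits in the MSBs; the adjusted KSHIFTR by 13
// below then brings them back down with zeros, rather than stale widened bits,
// filling the top of the v4i1 result.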
19629
19630 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19631 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19632 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19633 DAG.getIntPtrConstant(0, DL));
19634 }
19635 Offset += NumElts; // Increment for next iteration.
19636 }
19637
19638 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19639 // TODO: What other unary shuffles would benefit from this?
19640 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19641 V1->hasOneUse()) {
19642 SDValue Op0 = V1.getOperand(0);
19643 SDValue Op1 = V1.getOperand(1);
19644 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19645 EVT OpVT = Op0.getValueType();
19646 return DAG.getSetCC(
19647 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19648 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19649 }
19650
19651 MVT ExtVT;
19652 switch (VT.SimpleTy) {
19653 default:
19654 llvm_unreachable("Expected a vector of i1 elements");
19655 case MVT::v2i1:
19656 ExtVT = MVT::v2i64;
19657 break;
19658 case MVT::v4i1:
19659 ExtVT = MVT::v4i32;
19660 break;
19661 case MVT::v8i1:
19662 // Take a 512-bit type; there are more shuffles available on KNL. If we have
19663 // VLX, use a 256-bit shuffle.
19664 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19665 break;
19666 case MVT::v16i1:
19667 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19668 // 256-bit operation available.
19669 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19670 break;
19671 case MVT::v32i1:
19672 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19673 // 256-bit operation available.
19674 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19675 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19676 break;
19677 case MVT::v64i1:
19678 // Fall back to scalarization. FIXME: We can do better if the shuffle
19679 // can be partitioned cleanly.
19680 if (!Subtarget.useBWIRegs())
19681 return SDValue();
19682 ExtVT = MVT::v64i8;
19683 break;
19684 }
19685
19686 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19687 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19688
19689 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19690 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
19691 int NumElems = VT.getVectorNumElements();
19692 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19693 (Subtarget.hasDQI() && (NumElems < 32)))
19694 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19695 Shuffle, ISD::SETGT);
19696
19697 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19698}
19699
19700/// Helper function that returns true if the shuffle mask should be
19701/// commuted to improve canonicalization.
19702static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19703 int NumElements = Mask.size();
19704
19705 int NumV1Elements = 0, NumV2Elements = 0;
19706 for (int M : Mask)
19707 if (M < 0)
19708 continue;
19709 else if (M < NumElements)
19710 ++NumV1Elements;
19711 else
19712 ++NumV2Elements;
19713
19714 // Commute the shuffle as needed such that more elements come from V1 than
19715 // V2. This allows us to match the shuffle pattern strictly on how many
19716 // elements come from V1 without handling the symmetric cases.
19717 if (NumV2Elements > NumV1Elements)
19718 return true;
19719
19720 assert(NumV1Elements > 0 && "No V1 indices");
19721
19722 if (NumV2Elements == 0)
19723 return false;
19724
19725 // When the number of V1 and V2 elements is the same, try to minimize the
19726 // number of uses of V2 in the low half of the vector. When that is tied,
19727 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19728 // indices for V2. When those are equal, try to ensure that the number of odd
19729 // indices for V1 is lower than the number of odd indices for V2.
19730 if (NumV1Elements == NumV2Elements) {
19731 int LowV1Elements = 0, LowV2Elements = 0;
19732 for (int M : Mask.slice(0, NumElements / 2))
19733 if (M >= NumElements)
19734 ++LowV2Elements;
19735 else if (M >= 0)
19736 ++LowV1Elements;
19737 if (LowV2Elements > LowV1Elements)
19738 return true;
19739 if (LowV2Elements == LowV1Elements) {
19740 int SumV1Indices = 0, SumV2Indices = 0;
19741 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19742 if (Mask[i] >= NumElements)
19743 SumV2Indices += i;
19744 else if (Mask[i] >= 0)
19745 SumV1Indices += i;
19746 if (SumV2Indices < SumV1Indices)
19747 return true;
19748 if (SumV2Indices == SumV1Indices) {
19749 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19750 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19751 if (Mask[i] >= NumElements)
19752 NumV2OddIndices += i % 2;
19753 else if (Mask[i] >= 0)
19754 NumV1OddIndices += i % 2;
19755 if (NumV2OddIndices < NumV1OddIndices)
19756 return true;
19757 }
19758 }
19759 }
19760
19761 return false;
19762}
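// For illustration: a four-element mask {4, 5, 6, 3} uses three V2 elements and
// one V1 element, so this returns true and the caller commutes the operands
// (and the mask) so that most elements come from V1.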
19763
19764static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
19765 const X86Subtarget &Subtarget) {
19766 if (!Subtarget.hasAVX512())
19767 return false;
19768
19769 MVT VT = V1.getSimpleValueType().getScalarType();
19770 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
19771 return false;
19772
19773 // It is better to widen i8 to i16, because there is PBLENDW for vXi16
19774 // when the vector bit size is 128 or 256.
19775 if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512)
19776 return false;
19777
19778 auto HasMaskOperation = [&](SDValue V) {
19779 // TODO: Currently we only check a limited set of opcodes. We could probably
19780 // extend this to all binary operations by checking TLI.isBinOp().
19781 switch (V->getOpcode()) {
19782 default:
19783 return false;
19784 case ISD::ADD:
19785 case ISD::SUB:
19786 case ISD::AND:
19787 case ISD::XOR:
19788 break;
19789 }
19790 if (!V->hasOneUse())
19791 return false;
19792
19793 return true;
19794 };
19795
19796 if (HasMaskOperation(V1) || HasMaskOperation(V2))
19797 return true;
19798
19799 return false;
19800}
19801
19802// Forward declaration.
19803static SDValue canonicalizeShuffleMaskWithHorizOp(
19804 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19805 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19806 const X86Subtarget &Subtarget);
19807
19808/// Top-level lowering for x86 vector shuffles.
19809///
19810/// This handles decomposition, canonicalization, and lowering of all x86
19811/// vector shuffles. Most of the specific lowering strategies are encapsulated
19812/// above in helper routines. The canonicalization attempts to widen shuffles
19813/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19814/// s.t. only one of the two inputs needs to be tested, etc.
19815static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19816 SelectionDAG &DAG) {
19817 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19818 ArrayRef<int> OrigMask = SVOp->getMask();
19819 SDValue V1 = Op.getOperand(0);
19820 SDValue V2 = Op.getOperand(1);
19821 MVT VT = Op.getSimpleValueType();
19822 int NumElements = VT.getVectorNumElements();
19823 SDLoc DL(Op);
19824 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19825
19826 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19827        "Can't lower MMX shuffles");
19828
19829 bool V1IsUndef = V1.isUndef();
19830 bool V2IsUndef = V2.isUndef();
19831 if (V1IsUndef && V2IsUndef)
19832 return DAG.getUNDEF(VT);
19833
19834 // When we create a shuffle node we put the UNDEF node to second operand,
19835 // but in some cases the first operand may be transformed to UNDEF.
19836 // In this case we should just commute the node.
19837 if (V1IsUndef)
19838 return DAG.getCommutedVectorShuffle(*SVOp);
19839
19840 // Check for non-undef masks pointing at an undef vector and make the masks
19841 // undef as well. This makes it easier to match the shuffle based solely on
19842 // the mask.
19843 if (V2IsUndef &&
19844 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
19845 SmallVector<int, 8> NewMask(OrigMask);
19846 for (int &M : NewMask)
19847 if (M >= NumElements)
19848 M = -1;
19849 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
19850 }
19851
19852 // Check for illegal shuffle mask element index values.
19853 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
19854 (void)MaskUpperLimit;
19855 assert(llvm::all_of(OrigMask,
19856                     [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
19857        "Out of bounds shuffle index");
19858
19859 // We actually see shuffles that are entirely re-arrangements of a set of
19860 // zero inputs. This mostly happens while decomposing complex shuffles into
19861 // simple ones. Directly lower these as a buildvector of zeros.
19862 APInt KnownUndef, KnownZero;
19863 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
19864
19865 APInt Zeroable = KnownUndef | KnownZero;
19866 if (Zeroable.isAllOnes())
19867 return getZeroVector(VT, Subtarget, DAG, DL);
19868
19869 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
19870
19871 // Try to collapse shuffles into using a vector type with fewer elements but
19872 // wider element types. We cap this to not form integers or floating point
19873 // elements wider than 64 bits. It does not seem beneficial to form i128
19874 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
19875 SmallVector<int, 16> WidenedMask;
19876 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
19877 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
19878 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
19879 // Shuffle mask widening should not interfere with a broadcast opportunity
19880 // by obfuscating the operands with bitcasts.
19881 // TODO: Avoid lowering directly from this top-level function: make this
19882 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
19883 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
19884 Subtarget, DAG))
19885 return Broadcast;
19886
19887 MVT NewEltVT = VT.isFloatingPoint()
19888 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
19889 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
19890 int NewNumElts = NumElements / 2;
19891 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
19892 // Make sure that the new vector type is legal. For example, v2f64 isn't
19893 // legal on SSE1.
19894 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
19895 if (V2IsZero) {
19896 // Modify the new Mask to take all zeros from the all-zero vector.
19897 // Choose indices that are blend-friendly.
19898 bool UsedZeroVector = false;
19899 assert(is_contained(WidenedMask, SM_SentinelZero) &&
19900        "V2's non-undef elements are used?!");
19901 for (int i = 0; i != NewNumElts; ++i)
19902 if (WidenedMask[i] == SM_SentinelZero) {
19903 WidenedMask[i] = i + NewNumElts;
19904 UsedZeroVector = true;
19905 }
19906 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
19907 // some elements to be undef.
19908 if (UsedZeroVector)
19909 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
19910 }
19911 V1 = DAG.getBitcast(NewVT, V1);
19912 V2 = DAG.getBitcast(NewVT, V2);
19913 return DAG.getBitcast(
19914 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
19915 }
19916 }
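// For illustration: a v4i32 shuffle with mask {0, 1, 4, 5} pairs up cleanly and
// is re-expressed as a v2i64 shuffle with widened mask {0, 2}, halving the
// element count while doubling the element width.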
19917
19918 SmallVector<SDValue> Ops = {V1, V2};
19919 SmallVector<int> Mask(OrigMask);
19920
19921 // Canonicalize the shuffle with any horizontal ops inputs.
19922 // NOTE: This may update Ops and Mask.
19923 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
19924 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19925 return DAG.getBitcast(VT, HOp);
19926
19927 V1 = DAG.getBitcast(VT, Ops[0]);
19928 V2 = DAG.getBitcast(VT, Ops[1]);
19929 assert(NumElements == (int)Mask.size() &&
19930        "canonicalizeShuffleMaskWithHorizOp "
19931        "shouldn't alter the shuffle mask size");
19932
19933 // Commute the shuffle if it will improve canonicalization.
19934 if (canonicalizeShuffleMaskWithCommute(Mask)) {
19935 ShuffleVectorSDNode::commuteMask(Mask);
19936 std::swap(V1, V2);
19937 }
19938
19939 // For each vector width, delegate to a specialized lowering routine.
19940 if (VT.is128BitVector())
19941 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19942
19943 if (VT.is256BitVector())
19944 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19945
19946 if (VT.is512BitVector())
19947 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19948
19949 if (Is1BitVector)
19950 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19951
19952 llvm_unreachable("Unimplemented!");
19953}
19954
19955/// Try to lower a VSELECT instruction to a vector shuffle.
19956static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19957 const X86Subtarget &Subtarget,
19958 SelectionDAG &DAG) {
19959 SDValue Cond = Op.getOperand(0);
19960 SDValue LHS = Op.getOperand(1);
19961 SDValue RHS = Op.getOperand(2);
19962 MVT VT = Op.getSimpleValueType();
19963
19964 // Only non-legal VSELECTs reach this lowering; convert those into generic
19965 // shuffles and reuse the shuffle lowering path for blends.
19966 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19967 SmallVector<int, 32> Mask;
19968 if (createShuffleMaskFromVSELECT(Mask, Cond))
19969 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19970 }
19971
19972 return SDValue();
19973}
19974
19975SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19976 SDValue Cond = Op.getOperand(0);
19977 SDValue LHS = Op.getOperand(1);
19978 SDValue RHS = Op.getOperand(2);
19979
19980 SDLoc dl(Op);
19981 MVT VT = Op.getSimpleValueType();
19982 if (isSoftFP16(VT)) {
19983 MVT NVT = VT.changeVectorElementTypeToInteger();
19984 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
19985 DAG.getBitcast(NVT, LHS),
19986 DAG.getBitcast(NVT, RHS)));
19987 }
19988
19989 // A vselect where all conditions and data are constants can be optimized into
19990 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19991 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19992 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19993 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19994 return SDValue();
19995
19996 // Try to lower this to a blend-style vector shuffle. This can handle all
19997 // constant condition cases.
19998 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19999 return BlendOp;
20000
20001 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20002 // with patterns on the mask registers on AVX-512.
20003 MVT CondVT = Cond.getSimpleValueType();
20004 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20005 if (CondEltSize == 1)
20006 return Op;
20007
20008 // Variable blends are only legal from SSE4.1 onward.
20009 if (!Subtarget.hasSSE41())
20010 return SDValue();
20011
20012 unsigned EltSize = VT.getScalarSizeInBits();
20013 unsigned NumElts = VT.getVectorNumElements();
20014
20015 // Expand v32i16/v64i8 without BWI.
20016 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20017 return SDValue();
20018
20019 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20020 // into an i1 condition so that we can use the mask-based 512-bit blend
20021 // instructions.
20022 if (VT.getSizeInBits() == 512) {
20023 // Build a mask by testing the condition against zero.
20024 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20025 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20026 DAG.getConstant(0, dl, CondVT),
20027 ISD::SETNE);
20028 // Now return a new VSELECT using the mask.
20029 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20030 }
20031
20032 // SEXT/TRUNC cases where the mask doesn't match the destination size.
20033 if (CondEltSize != EltSize) {
20034 // If we don't have a sign splat, rely on the expansion.
20035 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20036 return SDValue();
20037
20038 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20039 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20040 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20041 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20042 }
20043
20044 // Only some types will be legal on some subtargets. If we can emit a legal
20045 // VSELECT-matching blend, return Op, but if we need to expand, return
20046 // a null value.
20047 switch (VT.SimpleTy) {
20048 default:
20049 // Most of the vector types have blends past SSE4.1.
20050 return Op;
20051
20052 case MVT::v32i8:
20053 // The byte blends for AVX vectors were introduced only in AVX2.
20054 if (Subtarget.hasAVX2())
20055 return Op;
20056
20057 return SDValue();
20058
20059 case MVT::v8i16:
20060 case MVT::v16i16: {
20061 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
20062 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20063 Cond = DAG.getBitcast(CastVT, Cond);
20064 LHS = DAG.getBitcast(CastVT, LHS);
20065 RHS = DAG.getBitcast(CastVT, RHS);
20066 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20067 return DAG.getBitcast(VT, Select);
20068 }
20069 }
20070}
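// For illustration of the v8i16/v16i16 case above: x86 vector select conditions
// are all-ones or all-zeros per element, so each 0xFFFF / 0x0000 i16 condition
// element bitcasts to two i8 elements that are both 0xFF or both 0x00, and the
// vXi8 VSELECT produces exactly the same byte-level blend.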
20071
20072static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20073 MVT VT = Op.getSimpleValueType();
20074 SDValue Vec = Op.getOperand(0);
20075 SDValue Idx = Op.getOperand(1);
20076 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20077 SDLoc dl(Op);
20078
20079 if (!Vec.getSimpleValueType().is128BitVector())
20080 return SDValue();
20081
20082 if (VT.getSizeInBits() == 8) {
20083 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20084 // we're going to zero extend the register or fold the store.
20085 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20086 !X86::mayFoldIntoStore(Op))
20087 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20088 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20089 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20090
20091 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20092 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20093 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20094 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20095 }
20096
20097 if (VT == MVT::f32) {
20098 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20099 // the result back to FR32 register. It's only worth matching if the
20100 // result has a single use which is a store or a bitcast to i32. And in
20101 // the case of a store, it's not worth it if the index is a constant 0,
20102 // because a MOVSSmr can be used instead, which is smaller and faster.
20103 if (!Op.hasOneUse())
20104 return SDValue();
20105 SDNode *User = *Op.getNode()->use_begin();
20106 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20107 (User->getOpcode() != ISD::BITCAST ||
20108 User->getValueType(0) != MVT::i32))
20109 return SDValue();
20110 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20111 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20112 return DAG.getBitcast(MVT::f32, Extract);
20113 }
20114
20115 if (VT == MVT::i32 || VT == MVT::i64)
20116 return Op;
20117
20118 return SDValue();
20119}
20120
20121/// Extract one bit from mask vector, like v16i1 or v8i1.
20122/// AVX-512 feature.
20123static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20124 const X86Subtarget &Subtarget) {
20125 SDValue Vec = Op.getOperand(0);
20126 SDLoc dl(Vec);
20127 MVT VecVT = Vec.getSimpleValueType();
20128 SDValue Idx = Op.getOperand(1);
20129 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20130 MVT EltVT = Op.getSimpleValueType();
20131
20132 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20133        "Unexpected vector type in ExtractBitFromMaskVector");
20134
20135 // A variable index can't be handled in mask registers;
20136 // extend the vector to VR512/128.
20137 if (!IdxC) {
20138 unsigned NumElts = VecVT.getVectorNumElements();
20139 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20140 // than extending to 128/256-bit.
20141 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20142 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20143 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20144 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20145 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20146 }
20147
20148 unsigned IdxVal = IdxC->getZExtValue();
20149 if (IdxVal == 0) // the operation is legal
20150 return Op;
20151
20152 // Extend to natively supported kshift.
20153 unsigned NumElems = VecVT.getVectorNumElements();
20154 MVT WideVecVT = VecVT;
20155 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20156 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20157 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20158 DAG.getUNDEF(WideVecVT), Vec,
20159 DAG.getIntPtrConstant(0, dl));
20160 }
20161
20162 // Use kshiftr instruction to move to the lower element.
20163 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20164 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20165
20166 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20167 DAG.getIntPtrConstant(0, dl));
20168}
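// [Editor's illustrative sketch; not part of the LLVM sources.] The kshiftr
// lowering above is the mask-register analogue of a plain scalar bit
// extract: shift the requested bit down to the LSB and read element 0. The
// same idea on an integer mask (helper name is hypothetical):
static bool extractMaskBitModel(unsigned KMask, unsigned IdxVal) {
  return (KMask >> IdxVal) & 1; // kshiftr by IdxVal, then extract element 0
}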
20169
20170SDValue
20171X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20172 SelectionDAG &DAG) const {
20173 SDLoc dl(Op);
20174 SDValue Vec = Op.getOperand(0);
20175 MVT VecVT = Vec.getSimpleValueType();
20176 SDValue Idx = Op.getOperand(1);
20177 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20178
20179 if (VecVT.getVectorElementType() == MVT::i1)
20180 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20181
20182 if (!IdxC) {
20183 // It's more profitable to go through memory (1 cycle throughput)
20184 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20185 // The IACA tool was used to get the performance estimate
20186 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20187 //
20188 // example : extractelement <16 x i8> %a, i32 %i
20189 //
20190 // Block Throughput: 3.00 Cycles
20191 // Throughput Bottleneck: Port5
20192 //
20193 // | Num Of | Ports pressure in cycles | |
20194 // | Uops | 0 - DV | 5 | 6 | 7 | |
20195 // ---------------------------------------------
20196 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20197 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20198 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20199 // Total Num Of Uops: 4
20200 //
20201 //
20202 // Block Throughput: 1.00 Cycles
20203 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20204 //
20205 // | | Ports pressure in cycles | |
20206 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20207 // ---------------------------------------------------------
20208 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20209 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20210 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20211 // Total Num Of Uops: 4
20212
20213 return SDValue();
20214 }
20215
20216 unsigned IdxVal = IdxC->getZExtValue();
20217
20218 // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
20219 // subvector and then extract the element from that 128-bit vector.
20220 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20221 // Get the 128-bit vector.
20222 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20223 MVT EltVT = VecVT.getVectorElementType();
20224
20225 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20226 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20227
20228 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20229 // this can be done with a mask.
20230 IdxVal &= ElemsPerChunk - 1;
20231 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20232 DAG.getIntPtrConstant(IdxVal, dl));
20233 }
20234
20235 assert(VecVT.is128BitVector() && "Unexpected vector length");
20236
20237 MVT VT = Op.getSimpleValueType();
20238
20239 if (VT == MVT::i16) {
20240 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20241 // we're going to zero extend the register or fold the store (SSE41 only).
20242 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20243 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20244 if (Subtarget.hasFP16())
20245 return Op;
20246
20247 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20248 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20249 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20250 }
20251
20252 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20253 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20254 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20255 }
20256
20257 if (Subtarget.hasSSE41())
20258 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20259 return Res;
20260
20261 // TODO: We only extract a single element from v16i8, so we can probably
20262 // afford to be more aggressive here before using the default approach of
20263 // spilling to stack.
20264 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20265 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20266 int DWordIdx = IdxVal / 4;
20267 if (DWordIdx == 0) {
20268 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20269 DAG.getBitcast(MVT::v4i32, Vec),
20270 DAG.getIntPtrConstant(DWordIdx, dl));
20271 int ShiftVal = (IdxVal % 4) * 8;
20272 if (ShiftVal != 0)
20273 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20274 DAG.getConstant(ShiftVal, dl, MVT::i8));
20275 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20276 }
20277
20278 int WordIdx = IdxVal / 2;
20279 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20280 DAG.getBitcast(MVT::v8i16, Vec),
20281 DAG.getIntPtrConstant(WordIdx, dl));
20282 int ShiftVal = (IdxVal % 2) * 8;
20283 if (ShiftVal != 0)
20284 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20285 DAG.getConstant(ShiftVal, dl, MVT::i8));
20286 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20287 }
20288
20289 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20290 if (IdxVal == 0)
20291 return Op;
20292
20293 // Shuffle the element to the lowest element, then movss or movsh.
20294 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20295 Mask[0] = static_cast<int>(IdxVal);
20296 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20297 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20298 DAG.getIntPtrConstant(0, dl));
20299 }
20300
20301 if (VT.getSizeInBits() == 64) {
20302 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20303 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20304 // to match extract_elt for f64.
20305 if (IdxVal == 0)
20306 return Op;
20307
20308 // UNPCKHPD the element to the lowest double word, then movsd.
20309 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20310 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20311 int Mask[2] = { 1, -1 };
20312 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20313 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20314 DAG.getIntPtrConstant(0, dl));
20315 }
20316
20317 return SDValue();
20318}
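// [Editor's illustrative sketch; not part of the LLVM sources.] The v16i8
// path above avoids a dedicated byte extract by pulling out the containing
// i32/i16 lane and shifting the wanted byte down. The same index arithmetic
// on plain integers, assuming the little-endian lane layout of x86; e.g.
// IdxVal == 5 gives WordIdx == 2 and ShiftVal == 8 (helper name hypothetical):
static unsigned char extractByteViaWordModel(const unsigned short Words[8],
                                             unsigned IdxVal) {
  unsigned WordIdx = IdxVal / 2;        // which i16 lane holds the byte
  unsigned ShiftVal = (IdxVal % 2) * 8; // 0 for the low byte, 8 for the high
  return (unsigned char)(Words[WordIdx] >> ShiftVal);
}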
20319
20320/// Insert one bit to mask vector, like v16i1 or v8i1.
20321/// AVX-512 feature.
20322static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20323 const X86Subtarget &Subtarget) {
20324 SDLoc dl(Op);
20325 SDValue Vec = Op.getOperand(0);
20326 SDValue Elt = Op.getOperand(1);
20327 SDValue Idx = Op.getOperand(2);
20328 MVT VecVT = Vec.getSimpleValueType();
20329
20330 if (!isa<ConstantSDNode>(Idx)) {
20331 // Non-constant index. Extend the source and destination,
20332 // insert the element, and then truncate the result.
20333 unsigned NumElts = VecVT.getVectorNumElements();
20334 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20335 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20336 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20337 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20338 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20339 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20340 }
20341
20342 // Copy into a k-register, extract to v1i1 and insert_subvector.
20343 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20344 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20345}
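// [Editor's illustrative sketch; not part of the LLVM sources.] For a
// constant index, the v1i1 insert_subvector above amounts to overwriting a
// single bit of the mask; the scalar equivalent, assuming IdxVal is less
// than the mask width (helper name is hypothetical):
static unsigned insertMaskBitModel(unsigned KMask, unsigned IdxVal, bool Bit) {
  return (KMask & ~(1u << IdxVal)) | ((unsigned)Bit << IdxVal);
}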
20346
20347SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20348 SelectionDAG &DAG) const {
20349 MVT VT = Op.getSimpleValueType();
20350 MVT EltVT = VT.getVectorElementType();
20351 unsigned NumElts = VT.getVectorNumElements();
20352 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20353
20354 if (EltVT == MVT::i1)
20355 return InsertBitToMaskVector(Op, DAG, Subtarget);
20356
20357 SDLoc dl(Op);
20358 SDValue N0 = Op.getOperand(0);
20359 SDValue N1 = Op.getOperand(1);
20360 SDValue N2 = Op.getOperand(2);
20361 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20362
20363 if (!N2C) {
20364 // Variable insertion indices, usually we're better off spilling to stack,
20365 // but AVX512 can use a variable compare+select by comparing against all
20366 // possible vector indices, and FP insertion has less gpr->simd traffic.
20367 if (!(Subtarget.hasBWI() ||
20368 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20369 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20370 return SDValue();
20371
20372 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20373 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20374 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20375 return SDValue();
20376
20377 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20378 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20379 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20380
20381 SmallVector<SDValue, 16> RawIndices;
20382 for (unsigned I = 0; I != NumElts; ++I)
20383 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20384 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20385
20386 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20387 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20388 ISD::CondCode::SETEQ);
20389 }
20390
20391 if (N2C->getAPIntValue().uge(NumElts))
20392 return SDValue();
20393 uint64_t IdxVal = N2C->getZExtValue();
20394
20395 bool IsZeroElt = X86::isZeroNode(N1);
20396 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20397
20398 if (IsZeroElt || IsAllOnesElt) {
20399 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20400 // We don't deal with i8 0 since it appears to be handled elsewhere.
20401 if (IsAllOnesElt &&
20402 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20403 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20404 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20405 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20406 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20407 CstVectorElts[IdxVal] = OnesCst;
20408 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20409 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20410 }
20411 // See if we can do this more efficiently with a blend shuffle with a
20412 // rematerializable vector.
20413 if (Subtarget.hasSSE41() &&
20414 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20415 SmallVector<int, 8> BlendMask;
20416 for (unsigned i = 0; i != NumElts; ++i)
20417 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20418 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20419 : getOnesVector(VT, DAG, dl);
20420 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20421 }
20422 }
20423
20424 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20425 // into that, and then insert the subvector back into the result.
20426 if (VT.is256BitVector() || VT.is512BitVector()) {
20427 // With a 256-bit vector, we can insert into the zero element efficiently
20428 // using a blend if we have AVX or AVX2 and the right data type.
20429 if (VT.is256BitVector() && IdxVal == 0) {
20430 // TODO: It is worthwhile to cast integer to floating point and back
20431 // and incur a domain crossing penalty if that's what we'll end up
20432 // doing anyway after extracting to a 128-bit vector.
20433 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20434 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20435 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20436 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20437 DAG.getTargetConstant(1, dl, MVT::i8));
20438 }
20439 }
20440
20441 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20442 assert(isPowerOf2_32(NumEltsIn128) &&
20443 "Vectors will always have power-of-two number of elements.");
20444
20445 // If we are not inserting into the low 128-bit vector chunk,
20446 // then prefer the broadcast+blend sequence.
20447 // FIXME: relax the profitability check iff all N1 uses are insertions.
20448 if (IdxVal >= NumEltsIn128 &&
20449 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20450 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20451 X86::mayFoldLoad(N1, Subtarget)))) {
20452 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20453 SmallVector<int, 8> BlendMask;
20454 for (unsigned i = 0; i != NumElts; ++i)
20455 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20456 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20457 }
20458
20459 // Get the desired 128-bit vector chunk.
20460 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20461
20462 // Insert the element into the desired chunk.
20463 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20464 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20465
20466 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20467 DAG.getIntPtrConstant(IdxIn128, dl));
20468
20469 // Insert the changed part back into the bigger vector
20470 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20471 }
20472 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20473
20474 // This will be just movw/movd/movq/movsh/movss/movsd.
20475 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20476 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20477 EltVT == MVT::f16 || EltVT == MVT::i64) {
20478 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20479 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20480 }
20481
20482 // We can't directly insert an i8 or i16 into a vector, so zero extend
20483 // it to i32 first.
20484 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20485 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20486 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20487 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20488 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20489 return DAG.getBitcast(VT, N1);
20490 }
20491 }
20492
20493 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20494 // argument. SSE41 is required for pinsrb.
20495 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20496 unsigned Opc;
20497 if (VT == MVT::v8i16) {
20498 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20499 Opc = X86ISD::PINSRW;
20500 } else {
20501 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20502 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20503 Opc = X86ISD::PINSRB;
20504 }
20505
20506 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20507 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20508 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20509 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20510 }
20511
20512 if (Subtarget.hasSSE41()) {
20513 if (EltVT == MVT::f32) {
20514 // Bits [7:6] of the constant are the source select. This will always be
20515 // zero here. The DAG Combiner may combine an extract_elt index into
20516 // these bits. For example (insert (extract, 3), 2) could be matched by
20517 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20518 // Bits [5:4] of the constant are the destination select. This is the
20519 // value of the incoming immediate.
20520 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20521 // combine either bitwise AND or insert of float 0.0 to set these bits.
20522
20523 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20524 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20525 // If this is an insertion of 32-bits into the low 32-bits of
20526 // a vector, we prefer to generate a blend with immediate rather
20527 // than an insertps. Blends are simpler operations in hardware and so
20528 // will always have equal or better performance than insertps.
20529 // But if optimizing for size and there's a load folding opportunity,
20530 // generate insertps because blendps does not have a 32-bit memory
20531 // operand form.
20532 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20533 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20534 DAG.getTargetConstant(1, dl, MVT::i8));
20535 }
20536 // Create this as a scalar-to-vector.
20537 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20538 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20539 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20540 }
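// [Editor's worked example; not from the LLVM sources.] For the INSERTPS
// path above with IdxVal == 2, the immediate IdxVal << 4 == 0x20 decodes as:
// bits [7:6] = 0 (read source element 0), bits [5:4] = 2 (write lane 2),
// bits [3:0] = 0 (no destination lanes zeroed).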
20541
20542 // PINSR* works with constant index.
20543 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20544 return Op;
20545 }
20546
20547 return SDValue();
20548}
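// [Editor's illustrative sketch; not part of the LLVM sources.] The
// variable-index path above ("inselt N0, N1, N2 --> select (SplatN2 ==
// {0,1,2...}) ? SplatN1 : N0") is the vector form of this scalar loop
// (helper name is hypothetical):
static void insertEltBySelectModel(float Vec[], unsigned NumElts, float Elt,
                                   unsigned Idx) {
  for (unsigned I = 0; I != NumElts; ++I)
    Vec[I] = (I == Idx) ? Elt : Vec[I]; // per-lane compare + select
}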
20549
20550static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20551 SelectionDAG &DAG) {
20552 SDLoc dl(Op);
20553 MVT OpVT = Op.getSimpleValueType();
20554
20555 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
20556 // further combines.
20557 if (X86::isZeroNode(Op.getOperand(0)))
20558 return getZeroVector(OpVT, Subtarget, DAG, dl);
20559
20560 // If this is a 256-bit vector result, first insert into a 128-bit
20561 // vector and then insert into the 256-bit vector.
20562 if (!OpVT.is128BitVector()) {
20563 // Insert into a 128-bit vector.
20564 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20565 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20566 OpVT.getVectorNumElements() / SizeFactor);
20567
20568 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20569
20570 // Insert the 128-bit vector.
20571 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20572 }
20573 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20574 "Expected an SSE type!");
20575
20576 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20577 // tblgen.
20578 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20579 return Op;
20580
20581 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20582 return DAG.getBitcast(
20583 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20584}
20585
20586// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20587// simple superregister reference or explicit instructions to insert
20588// the upper bits of a vector.
20589static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20590 SelectionDAG &DAG) {
20591 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20592
20593 return insert1BitVector(Op, DAG, Subtarget);
20594}
20595
20596static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20597 SelectionDAG &DAG) {
20598 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20599 "Only vXi1 extract_subvectors need custom lowering");
20600
20601 SDLoc dl(Op);
20602 SDValue Vec = Op.getOperand(0);
20603 uint64_t IdxVal = Op.getConstantOperandVal(1);
20604
20605 if (IdxVal == 0) // the operation is legal
20606 return Op;
20607
20608 MVT VecVT = Vec.getSimpleValueType();
20609 unsigned NumElems = VecVT.getVectorNumElements();
20610
20611 // Extend to natively supported kshift.
20612 MVT WideVecVT = VecVT;
20613 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20614 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20615 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20616 DAG.getUNDEF(WideVecVT), Vec,
20617 DAG.getIntPtrConstant(0, dl));
20618 }
20619
20620 // Shift to the LSB.
20621 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20622 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20623
20624 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20625 DAG.getIntPtrConstant(0, dl));
20626}
20627
20628// Returns the appropriate wrapper opcode for a global reference.
20629unsigned X86TargetLowering::getGlobalWrapperKind(
20630 const GlobalValue *GV, const unsigned char OpFlags) const {
20631 // References to absolute symbols are never PC-relative.
20632 if (GV && GV->isAbsoluteSymbolRef())
20633 return X86ISD::Wrapper;
20634
20635 CodeModel::Model M = getTargetMachine().getCodeModel();
20636 if (Subtarget.isPICStyleRIPRel() &&
20637 (M == CodeModel::Small || M == CodeModel::Kernel))
20638 return X86ISD::WrapperRIP;
20639
20640 // In the medium model, functions can always be referenced RIP-relatively,
20641 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20642 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20643 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20644 return X86ISD::WrapperRIP;
20645
20646 // GOTPCREL references must always use RIP.
20647 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20648 return X86ISD::WrapperRIP;
20649
20650 return X86ISD::Wrapper;
20651}
20652
20653// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20654// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
20655 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20656 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20657 // be used to form an addressing mode. These wrapped nodes will be selected
20658// into MOV32ri.
20659SDValue
20660X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20661 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20662
20663 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20664 // global base reg.
20665 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20666
20667 auto PtrVT = getPointerTy(DAG.getDataLayout());
20668 SDValue Result = DAG.getTargetConstantPool(
20669 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20670 SDLoc DL(CP);
20671 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20672 // With PIC, the address is actually $g + Offset.
20673 if (OpFlag) {
20674 Result =
20675 DAG.getNode(ISD::ADD, DL, PtrVT,
20676 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20677 }
20678
20679 return Result;
20680}
20681
20682SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20683 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20684
20685 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20686 // global base reg.
20687 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20688
20689 auto PtrVT = getPointerTy(DAG.getDataLayout());
20690 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20691 SDLoc DL(JT);
20692 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20693
20694 // With PIC, the address is actually $g + Offset.
20695 if (OpFlag)
20696 Result =
20697 DAG.getNode(ISD::ADD, DL, PtrVT,
20698 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20699
20700 return Result;
20701}
20702
20703SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20704 SelectionDAG &DAG) const {
20705 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20706}
20707
20708SDValue
20709X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20710 // Create the TargetBlockAddressAddress node.
20711 unsigned char OpFlags =
20712 Subtarget.classifyBlockAddressReference();
20713 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20714 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20715 SDLoc dl(Op);
20716 auto PtrVT = getPointerTy(DAG.getDataLayout());
20717 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20718 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20719
20720 // With PIC, the address is actually $g + Offset.
20721 if (isGlobalRelativeToPICBase(OpFlags)) {
20722 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20723 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20724 }
20725
20726 return Result;
20727}
20728
20729/// Creates target global address or external symbol nodes for calls or
20730/// other uses.
20731SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20732 bool ForCall) const {
20733 // Unpack the global address or external symbol.
20734 const SDLoc &dl = SDLoc(Op);
20735 const GlobalValue *GV = nullptr;
20736 int64_t Offset = 0;
20737 const char *ExternalSym = nullptr;
20738 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20739 GV = G->getGlobal();
20740 Offset = G->getOffset();
20741 } else {
20742 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20743 ExternalSym = ES->getSymbol();
20744 }
20745
20746 // Calculate some flags for address lowering.
20747 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20748 unsigned char OpFlags;
20749 if (ForCall)
20750 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20751 else
20752 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20753 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20754 bool NeedsLoad = isGlobalStubReference(OpFlags);
20755
20756 CodeModel::Model M = DAG.getTarget().getCodeModel();
20757 auto PtrVT = getPointerTy(DAG.getDataLayout());
20758 SDValue Result;
20759
20760 if (GV) {
20761 // Create a target global address if this is a global. If possible, fold the
20762 // offset into the global address reference. Otherwise, ADD it on later.
20763 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20764 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20765 // relocation will compute to a negative value, which is invalid.
20766 int64_t GlobalOffset = 0;
20767 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20768 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20769 std::swap(GlobalOffset, Offset);
20770 }
20771 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20772 } else {
20773 // If this is not a global address, this must be an external symbol.
20774 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20775 }
20776
20777 // If this is a direct call, avoid the wrapper if we don't need to do any
20778 // loads or adds. This allows SDAG ISel to match direct calls.
20779 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20780 return Result;
20781
20782 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20783
20784 // With PIC, the address is actually $g + Offset.
20785 if (HasPICReg) {
20786 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20787 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20788 }
20789
20790 // For globals that require a load from a stub to get the address, emit the
20791 // load.
20792 if (NeedsLoad)
20793 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20794 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20795
20796 // If there was a non-zero offset that we didn't fold, create an explicit
20797 // addition for it.
20798 if (Offset != 0)
20799 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20800 DAG.getConstant(Offset, dl, PtrVT));
20801
20802 return Result;
20803}
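// [Editor's illustrative sketch; not part of the LLVM sources.] The offset
// folding above, reduced to its essentials; the upper bound is only a
// stand-in for X86::isOffsetSuitableForCodeModel (helper name and bound are
// assumptions):
static bool canFoldGlobalOffsetModel(long long Offset, bool NoFlags) {
  // "movl foo-1, %eax" is rejected: if &foo were 0, the 32-bit relocation
  // would compute to a negative, unencodable value.
  return NoFlags && Offset >= 0 && Offset <= 0x7fffffffLL;
}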
20804
20805SDValue
20806X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20807 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20808}
20809
20810static SDValue
20811GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20812 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
20813 unsigned char OperandFlags, bool LocalDynamic = false) {
20814 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20815 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20816 SDLoc dl(GA);
20817 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20818 GA->getValueType(0),
20819 GA->getOffset(),
20820 OperandFlags);
20821
20822 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20823 : X86ISD::TLSADDR;
20824
20825 if (InFlag) {
20826 SDValue Ops[] = { Chain, TGA, *InFlag };
20827 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20828 } else {
20829 SDValue Ops[] = { Chain, TGA };
20830 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20831 }
20832
20833 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
20834 MFI.setAdjustsStack(true);
20835 MFI.setHasCalls(true);
20836
20837 SDValue Flag = Chain.getValue(1);
20838 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
20839}
20840
20841// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20842static SDValue
20843LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20844 const EVT PtrVT) {
20845 SDValue InFlag;
20846 SDLoc dl(GA); // ? function entry point might be better
20847 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20848 DAG.getNode(X86ISD::GlobalBaseReg,
20849 SDLoc(), PtrVT), InFlag);
20850 InFlag = Chain.getValue(1);
20851
20852 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
20853}
20854
20855// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
20856static SDValue
20857LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20858 const EVT PtrVT) {
20859 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20860 X86::RAX, X86II::MO_TLSGD);
20861}
20862
20863// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20864static SDValue
20865LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20866 const EVT PtrVT) {
20867 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20868 X86::EAX, X86II::MO_TLSGD);
20869}
20870
20871static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
20872 SelectionDAG &DAG, const EVT PtrVT,
20873 bool Is64Bit, bool Is64BitLP64) {
20874 SDLoc dl(GA);
20875
20876 // Get the start address of the TLS block for this module.
20877 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
20878 .getInfo<X86MachineFunctionInfo>();
20879 MFI->incNumLocalDynamicTLSAccesses();
20880
20881 SDValue Base;
20882 if (Is64Bit) {
20883 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20884 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
20885 X86II::MO_TLSLD, /*LocalDynamic=*/true);
20886 } else {
20887 SDValue InFlag;
20888 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20889 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
20890 InFlag = Chain.getValue(1);
20891 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
20892 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
20893 }
20894
20895 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20896 // of Base.
20897
20898 // Build x@dtpoff.
20899 unsigned char OperandFlags = X86II::MO_DTPOFF;
20900 unsigned WrapperKind = X86ISD::Wrapper;
20901 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20902 GA->getValueType(0),
20903 GA->getOffset(), OperandFlags);
20904 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20905
20906 // Add x@dtpoff with the base.
20907 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20908}
20909
20910// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20911static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20912 const EVT PtrVT, TLSModel::Model model,
20913 bool is64Bit, bool isPIC) {
20914 SDLoc dl(GA);
20915
20916 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20917 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
20918 is64Bit ? 257 : 256));
20919
20920 SDValue ThreadPointer =
20921 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20922 MachinePointerInfo(Ptr));
20923
20924 unsigned char OperandFlags = 0;
20925 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20926 // initialexec.
20927 unsigned WrapperKind = X86ISD::Wrapper;
20928 if (model == TLSModel::LocalExec) {
20929 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20930 } else if (model == TLSModel::InitialExec) {
20931 if (is64Bit) {
20932 OperandFlags = X86II::MO_GOTTPOFF;
20933 WrapperKind = X86ISD::WrapperRIP;
20934 } else {
20935 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20936 }
20937 } else {
20938 llvm_unreachable("Unexpected model");
20939 }
20940
20941 // emit "addl x@ntpoff,%eax" (local exec)
20942 // or "addl x@indntpoff,%eax" (initial exec)
20943 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
20944 SDValue TGA =
20945 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20946 GA->getOffset(), OperandFlags);
20947 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20948
20949 if (model == TLSModel::InitialExec) {
20950 if (isPIC && !is64Bit) {
20951 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20952 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20953 Offset);
20954 }
20955
20956 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20957 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20958 }
20959
20960 // The address of the thread local variable is the add of the thread
20961 // pointer with the offset of the variable.
20962 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20963}
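// [Editor's illustrative sketch; not part of the LLVM sources.] In both exec
// models the final address is just "thread pointer + offset": local-exec
// bakes the offset in at link time (x@tpoff / x@ntpoff), while initial-exec
// first loads it from the GOT (x@gottpoff / x@indntpoff). In isolation
// (helper name is hypothetical):
static char *tlsExecAddressModel(char *ThreadPointer, long Offset) {
  return ThreadPointer + Offset;
}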
20964
20965SDValue
20966X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20967
20968 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20969
20970 if (DAG.getTarget().useEmulatedTLS())
20971 return LowerToTLSEmulatedModel(GA, DAG);
20972
20973 const GlobalValue *GV = GA->getGlobal();
20974 auto PtrVT = getPointerTy(DAG.getDataLayout());
20975 bool PositionIndependent = isPositionIndependent();
20976
20977 if (Subtarget.isTargetELF()) {
20978 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20979 switch (model) {
20980 case TLSModel::GeneralDynamic:
20981 if (Subtarget.is64Bit()) {
20982 if (Subtarget.isTarget64BitLP64())
20983 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20984 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20985 }
20986 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20987 case TLSModel::LocalDynamic:
20988 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20989 Subtarget.isTarget64BitLP64());
20990 case TLSModel::InitialExec:
20991 case TLSModel::LocalExec:
20992 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20993 PositionIndependent);
20994 }
20995 llvm_unreachable("Unknown TLS model.");
20996 }
20997
20998 if (Subtarget.isTargetDarwin()) {
20999 // Darwin only has one model of TLS. Lower to that.
21000 unsigned char OpFlag = 0;
21001 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21002 X86ISD::WrapperRIP : X86ISD::Wrapper;
21003
21004 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21005 // global base reg.
21006 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21007 if (PIC32)
21008 OpFlag = X86II::MO_TLVP_PIC_BASE;
21009 else
21010 OpFlag = X86II::MO_TLVP;
21011 SDLoc DL(Op);
21012 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21013 GA->getValueType(0),
21014 GA->getOffset(), OpFlag);
21015 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21016
21017 // With PIC32, the address is actually $g + Offset.
21018 if (PIC32)
21019 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21020 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21021 Offset);
21022
21023 // Lowering the machine ISD will make sure everything is in the right
21024 // location.
21025 SDValue Chain = DAG.getEntryNode();
21026 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21027 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21028 SDValue Args[] = { Chain, Offset };
21029 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21030 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21031
21032 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
21033 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21034 MFI.setAdjustsStack(true);
21035
21036 // And our return value (tls address) is in the standard call return value
21037 // location.
21038 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21039 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21040 }
21041
21042 if (Subtarget.isOSWindows()) {
21043 // Just use the implicit TLS architecture
21044 // Need to generate something similar to:
21045 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21046 // ; from TEB
21047 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21048 // mov rcx, qword [rdx+rcx*8]
21049 // mov eax, .tls$:tlsvar
21050 // [rax+rcx] contains the address
21051 // Windows 64bit: gs:0x58
21052 // Windows 32bit: fs:__tls_array
21053
21054 SDLoc dl(GA);
21055 SDValue Chain = DAG.getEntryNode();
21056
21057 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21058 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21059 // use its literal value of 0x2C.
21060 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21061 ? Type::getInt8PtrTy(*DAG.getContext(),
21062 256)
21063 : Type::getInt32PtrTy(*DAG.getContext(),
21064 257));
21065
21066 SDValue TlsArray = Subtarget.is64Bit()
21067 ? DAG.getIntPtrConstant(0x58, dl)
21068 : (Subtarget.isTargetWindowsGNU()
21069 ? DAG.getIntPtrConstant(0x2C, dl)
21070 : DAG.getExternalSymbol("_tls_array", PtrVT));
21071
21072 SDValue ThreadPointer =
21073 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21074
21075 SDValue res;
21076 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21077 res = ThreadPointer;
21078 } else {
21079 // Load the _tls_index variable
21080 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21081 if (Subtarget.is64Bit())
21082 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21083 MachinePointerInfo(), MVT::i32);
21084 else
21085 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21086
21087 const DataLayout &DL = DAG.getDataLayout();
21088 SDValue Scale =
21089 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21090 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21091
21092 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21093 }
21094
21095 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21096
21097 // Get the offset of start of .tls section
21098 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21099 GA->getValueType(0),
21100 GA->getOffset(), X86II::MO_SECREL);
21101 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21102
21103 // The address of the thread local variable is the add of the thread
21104 // pointer with the offset of the variable.
21105 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21106 }
21107
21108 llvm_unreachable("TLS not implemented for this target.");
21109}
21110
21111/// Lower SRA_PARTS and friends, which return two i32 values
21112/// and take a 2 x i32 value to shift plus a shift amount.
21113/// TODO: Can this be moved to general expansion code?
21114static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21115 SDValue Lo, Hi;
21116 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21117 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21118}
21119
21120// Try to use a packed vector operation to handle i64 on 32-bit targets when
21121// AVX512DQ is enabled.
21122static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21123 const X86Subtarget &Subtarget) {
21124 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21125 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21126 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21127 Op.getOpcode() == ISD::UINT_TO_FP) &&
21128 "Unexpected opcode!");
21129 bool IsStrict = Op->isStrictFPOpcode();
21130 unsigned OpNo = IsStrict ? 1 : 0;
21131 SDValue Src = Op.getOperand(OpNo);
21132 MVT SrcVT = Src.getSimpleValueType();
21133 MVT VT = Op.getSimpleValueType();
21134
21135 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21136 (VT != MVT::f32 && VT != MVT::f64))
21137 return SDValue();
21138
21139 // Pack the i64 into a vector, do the operation and extract.
21140
21141 // Using 256-bit to ensure the result is 128 bits for the f32 case.
21142 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21143 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21144 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21145
21146 SDLoc dl(Op);
21147 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21148 if (IsStrict) {
21149 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21150 {Op.getOperand(0), InVec});
21151 SDValue Chain = CvtVec.getValue(1);
21152 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21153 DAG.getIntPtrConstant(0, dl));
21154 return DAG.getMergeValues({Value, Chain}, dl);
21155 }
21156
21157 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21158
21159 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21160 DAG.getIntPtrConstant(0, dl));
21161}
21162
21163// Try to use a packed vector operation to handle i64 on 32-bit targets.
21164static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21165 const X86Subtarget &Subtarget) {
21166 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21167 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21168 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21169 Op.getOpcode() == ISD::UINT_TO_FP) &&
21170 "Unexpected opcode!");
21171 bool IsStrict = Op->isStrictFPOpcode();
21172 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21173 MVT SrcVT = Src.getSimpleValueType();
21174 MVT VT = Op.getSimpleValueType();
21175
21176 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21177 return SDValue();
21178
21179 // Pack the i64 into a vector, do the operation and extract.
21180
21181 assert(Subtarget.hasFP16() && "Expected FP16");
21182
21183 SDLoc dl(Op);
21184 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21185 if (IsStrict) {
21186 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21187 {Op.getOperand(0), InVec});
21188 SDValue Chain = CvtVec.getValue(1);
21189 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21190 DAG.getIntPtrConstant(0, dl));
21191 return DAG.getMergeValues({Value, Chain}, dl);
21192 }
21193
21194 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21195
21196 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21197 DAG.getIntPtrConstant(0, dl));
21198}
21199
21200static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21201 const X86Subtarget &Subtarget) {
21202 switch (Opcode) {
21203 case ISD::SINT_TO_FP:
21204 // TODO: Handle wider types with AVX/AVX512.
21205 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21206 return false;
21207 // CVTDQ2PS or (V)CVTDQ2PD
21208 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21209
21210 case ISD::UINT_TO_FP:
21211 // TODO: Handle wider types and i64 elements.
21212 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21213 return false;
21214 // VCVTUDQ2PS or VCVTUDQ2PD
21215 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21216
21217 default:
21218 return false;
21219 }
21220}
21221
21222/// Given a scalar cast operation that is extracted from a vector, try to
21223/// vectorize the cast op followed by extraction. This will avoid an expensive
21224/// round-trip between XMM and GPR.
21225static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21226 const X86Subtarget &Subtarget) {
21227 // TODO: This could be enhanced to handle smaller integer types by peeking
21228 // through an extend.
21229 SDValue Extract = Cast.getOperand(0);
21230 MVT DestVT = Cast.getSimpleValueType();
21231 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21232 !isa<ConstantSDNode>(Extract.getOperand(1)))
21233 return SDValue();
21234
21235 // See if we have a 128-bit vector cast op for this type of cast.
21236 SDValue VecOp = Extract.getOperand(0);
21237 MVT FromVT = VecOp.getSimpleValueType();
21238 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21239 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21240 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21241 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21242 return SDValue();
21243
21244 // If we are extracting from a non-zero element, first shuffle the source
21245 // vector to allow extracting from element zero.
21246 SDLoc DL(Cast);
21247 if (!isNullConstant(Extract.getOperand(1))) {
21248 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21249 Mask[0] = Extract.getConstantOperandVal(1);
21250 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21251 }
21252 // If the source vector is wider than 128-bits, extract the low part. Do not
21253 // create an unnecessarily wide vector cast op.
21254 if (FromVT != Vec128VT)
21255 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21256
21257 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21258 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21259 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21260 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21261 DAG.getIntPtrConstant(0, DL));
21262}
21263
21264/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21265/// try to vectorize the cast ops. This will avoid an expensive round-trip
21266/// between XMM and GPR.
21267static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21268 const X86Subtarget &Subtarget) {
21269 // TODO: Allow FP_TO_UINT.
21270 SDValue CastToInt = CastToFP.getOperand(0);
21271 MVT VT = CastToFP.getSimpleValueType();
21272 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21273 return SDValue();
21274
21275 MVT IntVT = CastToInt.getSimpleValueType();
21276 SDValue X = CastToInt.getOperand(0);
21277 MVT SrcVT = X.getSimpleValueType();
21278 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21279 return SDValue();
21280
21281 // See if we have 128-bit vector cast instructions for this type of cast.
21282 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21283 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21284 IntVT != MVT::i32)
21285 return SDValue();
21286
21287 unsigned SrcSize = SrcVT.getSizeInBits();
21288 unsigned IntSize = IntVT.getSizeInBits();
21289 unsigned VTSize = VT.getSizeInBits();
21290 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21291 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21292 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21293
21294 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21295 unsigned ToIntOpcode =
21296 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21297 unsigned ToFPOpcode =
21298 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21299
21300 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21301 //
21302 // We are not defining the high elements (for example, by zeroing them) because
21303 // that could nullify any performance advantage that we hoped to gain from
21304 // this vector op hack. We do not expect any adverse effects (like denorm
21305 // penalties) with cast ops.
21306 SDLoc DL(CastToFP);
21307 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21308 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21309 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21310 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21311 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21312}
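A hypothetical standalone sketch of the same idea for the scalar (float)(int)x pattern, assuming SSE2 (names invented): both conversions stay in an XMM register and only lane 0 carries a meaningful value.

#include <immintrin.h>

// Hypothetical sketch, assuming SSE2: (float)(int)X without leaving XMM.
static float TruncViaVector(float X) {
  __m128 V = _mm_set_ss(X);        // lane 0 = X
  __m128i I = _mm_cvttps_epi32(V); // cvttps2dq: fp_to_sint per lane
  __m128 F = _mm_cvtepi32_ps(I);   // cvtdq2ps:  sint_to_fp per lane
  return _mm_cvtss_f32(F);         // read lane 0
}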
21313
21314static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21315 const X86Subtarget &Subtarget) {
21316 SDLoc DL(Op);
21317 bool IsStrict = Op->isStrictFPOpcode();
21318 MVT VT = Op->getSimpleValueType(0);
21319 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21320
21321 if (Subtarget.hasDQI()) {
21322 assert(!Subtarget.hasVLX() && "Unexpected features");
21323
21324 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21325 Src.getSimpleValueType() == MVT::v4i64) &&
21326 "Unsupported custom type");
21327
21328 // With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
21329 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21330 "Unexpected VT!");
21331 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21332
21333 // Need to concat with zero vector for strict fp to avoid spurious
21334 // exceptions.
21335 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21336 : DAG.getUNDEF(MVT::v8i64);
21337 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21338 DAG.getIntPtrConstant(0, DL));
21339 SDValue Res, Chain;
21340 if (IsStrict) {
21341 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21342 {Op->getOperand(0), Src});
21343 Chain = Res.getValue(1);
21344 } else {
21345 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21346 }
21347
21348 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21349 DAG.getIntPtrConstant(0, DL));
21350
21351 if (IsStrict)
21352 return DAG.getMergeValues({Res, Chain}, DL);
21353 return Res;
21354 }
21355
21356 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21357 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21358 if (VT != MVT::v4f32 || IsSigned)
21359 return SDValue();
21360
21361 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21362 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21363 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21364 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21365 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21366 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21367 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21368 SmallVector<SDValue, 4> SignCvts(4);
21369 SmallVector<SDValue, 4> Chains(4);
21370 for (int i = 0; i != 4; ++i) {
21371 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21372 DAG.getIntPtrConstant(i, DL));
21373 if (IsStrict) {
21374 SignCvts[i] =
21375 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21376 {Op.getOperand(0), Elt});
21377 Chains[i] = SignCvts[i].getValue(1);
21378 } else {
21379 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21380 }
21381 }
21382 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21383
21384 SDValue Slow, Chain;
21385 if (IsStrict) {
21386 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21387 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21388 {Chain, SignCvt, SignCvt});
21389 Chain = Slow.getValue(1);
21390 } else {
21391 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21392 }
21393
21394 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21395 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21396
21397 if (IsStrict)
21398 return DAG.getMergeValues({Cvt, Chain}, DL);
21399
21400 return Cvt;
21401}
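The unsigned slow path above (shift right by one with a sticky low bit, convert as signed, then add the result to itself for negative inputs) can be modeled per element, as in this hypothetical scalar sketch (helper name invented):

#include <cstdint>

// Hypothetical per-element model of the uint64 -> float slow path above.
static float U64ToFloat(uint64_t X) {
  if ((int64_t)X >= 0)
    return (float)(int64_t)X;           // fits in the signed range, convert directly
  uint64_t Halved = (X >> 1) | (X & 1); // srl by 1, keep bit 0 sticky for rounding
  float F = (float)(int64_t)Halved;     // signed conversion of the halved value
  return F + F;                         // double it back (the FADD above)
}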
21402
21403static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21404 bool IsStrict = Op->isStrictFPOpcode();
21405 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21406 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21407 MVT VT = Op.getSimpleValueType();
21408 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21409 SDLoc dl(Op);
21410
21411 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21412 if (IsStrict)
21413 return DAG.getNode(
21414 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21415 {Chain,
21416 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21417 Rnd});
21418 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21419 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21420}
21421
21422static bool isLegalConversion(MVT VT, bool IsSigned,
21423 const X86Subtarget &Subtarget) {
21424 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21425 return true;
21426 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21427 return true;
21428 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21429 return true;
21430 if (Subtarget.useAVX512Regs()) {
21431 if (VT == MVT::v16i32)
21432 return true;
21433 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21434 return true;
21435 }
21436 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21437 (VT == MVT::v2i64 || VT == MVT::v4i64))
21438 return true;
21439 return false;
21440}
21441
21442SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21443 SelectionDAG &DAG) const {
21444 bool IsStrict = Op->isStrictFPOpcode();
21445 unsigned OpNo = IsStrict ? 1 : 0;
21446 SDValue Src = Op.getOperand(OpNo);
21447 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21448 MVT SrcVT = Src.getSimpleValueType();
21449 MVT VT = Op.getSimpleValueType();
21450 SDLoc dl(Op);
21451
21452 if (isSoftFP16(VT))
21453 return promoteXINT_TO_FP(Op, DAG);
21454 else if (isLegalConversion(SrcVT, true, Subtarget))
21455 return Op;
21456
21457 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21458 return LowerWin64_INT128_TO_FP(Op, DAG);
21459
21460 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21461 return Extract;
21462
21463 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21464 return R;
21465
21466 if (SrcVT.isVector()) {
21467 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21468 // Note: Since v2f64 is a legal type, we don't need to zero extend the
21469 // source for strict FP.
21470 if (IsStrict)
21471 return DAG.getNode(
21472 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21473 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21474 DAG.getUNDEF(SrcVT))});
21475 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21476 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21477 DAG.getUNDEF(SrcVT)));
21478 }
21479 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21480 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21481
21482 return SDValue();
21483 }
21484
21485 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21486 "Unknown SINT_TO_FP to lower!");
21487
21488 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21489
21490 // These are really Legal; return the operand so the caller accepts it as
21491 // Legal.
21492 if (SrcVT == MVT::i32 && UseSSEReg)
21493 return Op;
21494 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21495 return Op;
21496
21497 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21498 return V;
21499 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21500 return V;
21501
21502 // SSE doesn't have an i16 conversion so we need to promote.
21503 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21504 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21505 if (IsStrict)
21506 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21507 {Chain, Ext});
21508
21509 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21510 }
21511
21512 if (VT == MVT::f128 || !Subtarget.hasX87())
21513 return SDValue();
21514
21515 SDValue ValueToStore = Src;
21516 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21517 // Bitcasting to f64 here allows us to do a single 64-bit store from
21518 // an SSE register, avoiding the store forwarding penalty that would come
21519 // with two 32-bit stores.
21520 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21521
21522 unsigned Size = SrcVT.getStoreSize();
21523 Align Alignment(Size);
21524 MachineFunction &MF = DAG.getMachineFunction();
21525 auto PtrVT = getPointerTy(MF.getDataLayout());
21526 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21527 MachinePointerInfo MPI =
21528 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21529 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21530 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21531 std::pair<SDValue, SDValue> Tmp =
21532 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21533
21534 if (IsStrict)
21535 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21536
21537 return Tmp.first;
21538}
21539
21540std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21541 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21542 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21543 // Build the FILD
21544 SDVTList Tys;
21545 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21546 if (useSSE)
21547 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21548 else
21549 Tys = DAG.getVTList(DstVT, MVT::Other);
21550
21551 SDValue FILDOps[] = {Chain, Pointer};
21552 SDValue Result =
21553 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21554 Alignment, MachineMemOperand::MOLoad);
21555 Chain = Result.getValue(1);
21556
21557 if (useSSE) {
21558 MachineFunction &MF = DAG.getMachineFunction();
21559 unsigned SSFISize = DstVT.getStoreSize();
21560 int SSFI =
21561 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21562 auto PtrVT = getPointerTy(MF.getDataLayout());
21563 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21564 Tys = DAG.getVTList(MVT::Other);
21565 SDValue FSTOps[] = {Chain, Result, StackSlot};
21566 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21567 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21568 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21569
21570 Chain =
21571 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21572 Result = DAG.getLoad(
21573 DstVT, DL, Chain, StackSlot,
21574 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21575 Chain = Result.getValue(1);
21576 }
21577
21578 return { Result, Chain };
21579}
21580
21581/// Horizontal vector math instructions may be slower than normal math with
21582/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21583/// implementation, and likely shuffle complexity of the alternate sequence.
21584static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21585 const X86Subtarget &Subtarget) {
21586 bool IsOptimizingSize = DAG.shouldOptForSize();
21587 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21588 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21589}
21590
21591/// 64-bit unsigned integer to double expansion.
21592static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21593 const X86Subtarget &Subtarget) {
21594 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
21595 // when converting 0 while rounding toward negative infinity. The caller will
21596 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
21597 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21598 // This algorithm is not obvious. Here is what we're trying to output:
21599 /*
21600 movq %rax, %xmm0
21601 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21602 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21603 #ifdef __SSE3__
21604 haddpd %xmm0, %xmm0
21605 #else
21606 pshufd $0x4e, %xmm0, %xmm1
21607 addpd %xmm1, %xmm0
21608 #endif
21609 */
21610
21611 SDLoc dl(Op);
21612 LLVMContext *Context = DAG.getContext();
21613
21614 // Build some magic constants.
21615 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21616 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21617 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21618 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21619
21620 SmallVector<Constant*,2> CV1;
21621 CV1.push_back(
21622 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21623 APInt(64, 0x4330000000000000ULL))));
21624 CV1.push_back(
21625 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21626 APInt(64, 0x4530000000000000ULL))));
21627 Constant *C1 = ConstantVector::get(CV1);
21628 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21629
21630 // Load the 64-bit value into an XMM register.
21631 SDValue XR1 =
21632 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21633 SDValue CLod0 = DAG.getLoad(
21634 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21635 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21636 SDValue Unpck1 =
21637 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21638
21639 SDValue CLod1 = DAG.getLoad(
21640 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21641 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21642 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21643 // TODO: Are there any fast-math-flags to propagate here?
21644 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21645 SDValue Result;
21646
21647 if (Subtarget.hasSSE3() &&
21648 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21649 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21650 } else {
21651 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21652 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21653 }
21654 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21655 DAG.getIntPtrConstant(0, dl));
21656 return Result;
21657}
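A hypothetical scalar model of the bias trick above (helper name invented; it assumes the default round-to-nearest mode and, as noted, is not valid for strict FP):

#include <cstdint>
#include <cstring>

// Hypothetical scalar model of the punpckldq/subpd/haddpd sequence above.
static double U64ToDouble(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL); // 2^52 + lo32
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // 2^84 + hi32 * 2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  // Subtract the magic constants (subpd), then sum the halves (haddpd / addpd).
  return (Lo - 0x1.0p52) + (Hi - 0x1.0p84);
}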
21658
21659/// 32-bit unsigned integer to float expansion.
21660static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21661 const X86Subtarget &Subtarget) {
21662 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21663 SDLoc dl(Op);
21664 // FP constant to bias correct the final result.
21665 SDValue Bias = DAG.getConstantFP(
21666 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21667
21668 // Load the 32-bit value into an XMM register.
21669 SDValue Load =
21670 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21671
21672 // Zero out the upper parts of the register.
21673 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21674
21675 // Or the load with the bias.
21676 SDValue Or = DAG.getNode(
21677 ISD::OR, dl, MVT::v2i64,
21678 DAG.getBitcast(MVT::v2i64, Load),
21679 DAG.getBitcast(MVT::v2i64,
21680 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21681 Or =
21682 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21683 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21684
21685 if (Op.getNode()->isStrictFPOpcode()) {
21686 // Subtract the bias.
21687 // TODO: Are there any fast-math-flags to propagate here?
21688 SDValue Chain = Op.getOperand(0);
21689 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21690 {Chain, Or, Bias});
21691
21692 if (Op.getValueType() == Sub.getValueType())
21693 return Sub;
21694
21695 // Handle final rounding.
21696 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21697 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21698
21699 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21700 }
21701
21702 // Subtract the bias.
21703 // TODO: Are there any fast-math-flags to propagate here?
21704 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21705
21706 // Handle final rounding.
21707 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21708}
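A hypothetical scalar model of the 32-bit bias trick above (helper name invented): OR the value into the mantissa of 2^52 and subtract the bias; the result is exact for any uint32_t.

#include <cstdint>
#include <cstring>

// Hypothetical scalar model of the OR-with-bias / subtract sequence above.
static double U32ToDouble(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // the double 2^52 + X, bit for bit
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1.0p52;                       // subtract the bias
}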
21709
21710static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21711 const X86Subtarget &Subtarget,
21712 const SDLoc &DL) {
21713 if (Op.getSimpleValueType() != MVT::v2f64)
21714 return SDValue();
21715
21716 bool IsStrict = Op->isStrictFPOpcode();
21717
21718 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21719 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21720
21721 if (Subtarget.hasAVX512()) {
21722 if (!Subtarget.hasVLX()) {
21723 // Let generic type legalization widen this.
21724 if (!IsStrict)
21725 return SDValue();
21726 // Otherwise pad the integer input with 0s and widen the operation.
21727 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21728 DAG.getConstant(0, DL, MVT::v2i32));
21729 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21730 {Op.getOperand(0), N0});
21731 SDValue Chain = Res.getValue(1);
21732 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21733 DAG.getIntPtrConstant(0, DL));
21734 return DAG.getMergeValues({Res, Chain}, DL);
21735 }
21736
21737 // Legalize to v4i32 type.
21738 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21739 DAG.getUNDEF(MVT::v2i32));
21740 if (IsStrict)
21741 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21742 {Op.getOperand(0), N0});
21743 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21744 }
21745
21746 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21747 // This gives us the floating point equivalent of 2^52 + the i32 integer
21748 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21749 // point leaving just our i32 integers in double format.
21750 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21751 SDValue VBias = DAG.getConstantFP(
21752 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21753 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21754 DAG.getBitcast(MVT::v2i64, VBias));
21755 Or = DAG.getBitcast(MVT::v2f64, Or);
21756
21757 if (IsStrict)
21758 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21759 {Op.getOperand(0), Or, VBias});
21760 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21761}
21762
21763static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21764 const X86Subtarget &Subtarget) {
21765 SDLoc DL(Op);
21766 bool IsStrict = Op->isStrictFPOpcode();
21767 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21768 MVT VecIntVT = V.getSimpleValueType();
21769 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21770 "Unsupported custom type");
21771
21772 if (Subtarget.hasAVX512()) {
21773 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
21774 assert(!Subtarget.hasVLX() && "Unexpected features");
21775 MVT VT = Op->getSimpleValueType(0);
21776
21777 // v8i32->v8f64 is legal with AVX512 so just return it.
21778 if (VT == MVT::v8f64)
21779 return Op;
21780
21781 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21782 "Unexpected VT!");
21783 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21784 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21785 // Need to concat with zero vector for strict fp to avoid spurious
21786 // exceptions.
21787 SDValue Tmp =
21788 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21789 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21790 DAG.getIntPtrConstant(0, DL));
21791 SDValue Res, Chain;
21792 if (IsStrict) {
21793 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21794 {Op->getOperand(0), V});
21795 Chain = Res.getValue(1);
21796 } else {
21797 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21798 }
21799
21800 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21801 DAG.getIntPtrConstant(0, DL));
21802
21803 if (IsStrict)
21804 return DAG.getMergeValues({Res, Chain}, DL);
21805 return Res;
21806 }
21807
21808 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21809 Op->getSimpleValueType(0) == MVT::v4f64) {
21810 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21811 Constant *Bias = ConstantFP::get(
21812 *DAG.getContext(),
21813 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21814 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21815 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21816 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21817 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21818 SDValue VBias = DAG.getMemIntrinsicNode(
21819 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21820 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21821 MachineMemOperand::MOLoad);
21822
21823 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21824 DAG.getBitcast(MVT::v4i64, VBias));
21825 Or = DAG.getBitcast(MVT::v4f64, Or);
21826
21827 if (IsStrict)
21828 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21829 {Op.getOperand(0), Or, VBias});
21830 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21831 }
21832
21833 // The algorithm is the following:
21834 // #ifdef __SSE4_1__
21835 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21836 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21837 // (uint4) 0x53000000, 0xaa);
21838 // #else
21839 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21840 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21841 // #endif
21842 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21843 // return (float4) lo + fhi;
21844
21845 bool Is128 = VecIntVT == MVT::v4i32;
21846 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21847 // If we convert to something other than the supported type, e.g., to v4f64,
21848 // abort early.
21849 if (VecFloatVT != Op->getSimpleValueType(0))
21850 return SDValue();
21851
21852 // In the #ifdef/#else code, we have in common:
21853 // - The vector of constants:
21854 // -- 0x4b000000
21855 // -- 0x53000000
21856 // - A shift:
21857 // -- v >> 16
21858
21859 // Create the splat vector for 0x4b000000.
21860 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21861 // Create the splat vector for 0x53000000.
21862 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21863
21864 // Create the right shift.
21865 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21866 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21867
21868 SDValue Low, High;
21869 if (Subtarget.hasSSE41()) {
21870 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21871 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21872 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21873 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21874 // Low will be bitcasted right away, so do not bother bitcasting back to its
21875 // original type.
21876 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21877 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21878 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21879 // (uint4) 0x53000000, 0xaa);
21880 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21881 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21882 // High will be bitcasted right away, so do not bother bitcasting back to
21883 // its original type.
21884 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21885 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21886 } else {
21887 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21888 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21889 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21890 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21891
21892 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21893 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21894 }
21895
21896 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21897 SDValue VecCstFSub = DAG.getConstantFP(
21898 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21899
21900 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21901 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21902 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
21903 // enabled. See PR24512.
21904 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21905 // TODO: Are there any fast-math-flags to propagate here?
21906 // (float4) lo;
21907 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21908 // return (float4) lo + fhi;
21909 if (IsStrict) {
21910 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21911 {Op.getOperand(0), HighBitcast, VecCstFSub});
21912 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21913 {FHigh.getValue(1), LowBitcast, FHigh});
21914 }
21915
21916 SDValue FHigh =
21917 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21918 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21919}
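A hypothetical scalar model of the lo/hi algorithm in the comment above (helper name invented); only the final addition rounds, matching the vector sequence.

#include <cstdint>
#include <cstring>

// Hypothetical scalar model of the lo/hi split above.
static float U32ToFloat(uint32_t V) {
  uint32_t LoBits = (V & 0xffffu) | 0x4b000000u; // float 2^23 + lo16
  uint32_t HiBits = (V >> 16) | 0x53000000u;     // float 2^39 + hi16 * 2^16
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f); // fsub of a positive constant (PR24512)
  return Lo + FHi;                          // (float4) lo + fhi
}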
21920
21921static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
21922 const X86Subtarget &Subtarget) {
21923 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21924 SDValue N0 = Op.getOperand(OpNo);
21925 MVT SrcVT = N0.getSimpleValueType();
21926 SDLoc dl(Op);
21927
21928 switch (SrcVT.SimpleTy) {
21929 default:
21930 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21931 case MVT::v2i32:
21932 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
21933 case MVT::v4i32:
21934 case MVT::v8i32:
21935 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
21936 case MVT::v2i64:
21937 case MVT::v4i64:
21938 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21939 }
21940}
21941
21942SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21943 SelectionDAG &DAG) const {
21944 bool IsStrict = Op->isStrictFPOpcode();
21945 unsigned OpNo = IsStrict ? 1 : 0;
21946 SDValue Src = Op.getOperand(OpNo);
21947 SDLoc dl(Op);
21948 auto PtrVT = getPointerTy(DAG.getDataLayout());
21949 MVT SrcVT = Src.getSimpleValueType();
21950 MVT DstVT = Op->getSimpleValueType(0);
21951 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21952
21953 // Bail out when we don't have native conversion instructions.
21954 if (DstVT == MVT::f128)
21955 return SDValue();
21956
21957 if (isSoftFP16(DstVT))
21958 return promoteXINT_TO_FP(Op, DAG);
21959 else if (isLegalConversion(SrcVT, false, Subtarget))
21960 return Op;
21961
21962 if (DstVT.isVector())
21963 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
21964
21965 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21966 return LowerWin64_INT128_TO_FP(Op, DAG);
21967
21968 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21969 return Extract;
21970
21971 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21972 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21973 // Conversions from unsigned i32 to f32/f64 are legal,
21974 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21975 return Op;
21976 }
21977
21978 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21979 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21980 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21981 if (IsStrict)
21982 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21983 {Chain, Src});
21984 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21985 }
21986
21987 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21988 return V;
21989 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21990 return V;
21991
21992 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21993 // infinity. It produces -0.0, so disable under strictfp.
21994 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21995 !IsStrict)
21996 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
21997 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21998 // negative infinity, so disable it under strictfp and use FILD instead.
21999 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22000 !IsStrict)
22001 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22002 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22003 (DstVT == MVT::f32 || DstVT == MVT::f64))
22004 return SDValue();
22005
22006 // Make a 64-bit buffer, and use it to build an FILD.
22007 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22008 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22009 Align SlotAlign(8);
22010 MachinePointerInfo MPI =
22011 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22012 if (SrcVT == MVT::i32) {
22013 SDValue OffsetSlot =
22014 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22015 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22016 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22017 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22018 std::pair<SDValue, SDValue> Tmp =
22019 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22020 if (IsStrict)
22021 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22022
22023 return Tmp.first;
22024 }
22025
22026 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22027 SDValue ValueToStore = Src;
22028 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22029 // Bitcasting to f64 here allows us to do a single 64-bit store from
22030 // an SSE register, avoiding the store forwarding penalty that would come
22031 // with two 32-bit stores.
22032 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22033 }
22034 SDValue Store =
22035 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22036 // For i64 source, we need to add the appropriate power of 2 if the input
22037 // was negative. We must be careful to do the computation in x87 extended
22038 // precision, not in SSE.
22039 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22040 SDValue Ops[] = { Store, StackSlot };
22041 SDValue Fild =
22042 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22043 SlotAlign, MachineMemOperand::MOLoad);
22044 Chain = Fild.getValue(1);
22045
22046
22047 // Check whether the sign bit is set.
22048 SDValue SignSet = DAG.getSetCC(
22049 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22050 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22051
22052 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22053 APInt FF(64, 0x5F80000000000000ULL);
22054 SDValue FudgePtr = DAG.getConstantPool(
22055 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22056 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22057
22058 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22059 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22060 SDValue Four = DAG.getIntPtrConstant(4, dl);
22061 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22062 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22063
22064 // Load the value out, extending it from f32 to f80.
22065 SDValue Fudge = DAG.getExtLoad(
22066 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22067 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22068 CPAlignment);
22069 Chain = Fudge.getValue(1);
22070 // Extend everything to 80 bits to force it to be done on x87.
22071 // TODO: Are there any fast-math-flags to propagate here?
22072 if (IsStrict) {
22073 unsigned Opc = ISD::STRICT_FADD;
22074 // Windows needs the precision control changed to 80 bits around this add.
22075 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22076 Opc = X86ISD::STRICT_FP80_ADD;
22077
22078 SDValue Add =
22079 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22080 // STRICT_FP_ROUND can't handle equal types.
22081 if (DstVT == MVT::f80)
22082 return Add;
22083 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22084 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22085 }
22086 unsigned Opc = ISD::FADD;
22087 // Windows needs the precision control changed to 80 bits around this add.
22088 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22089 Opc = X86ISD::FP80_ADD;
22090
22091 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22092 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22093 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22094}
22095
22096// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22097// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22098// just return an SDValue().
22099// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22100// to i16, i32 or i64, and we lower it to a legal sequence and return the
22101// result.
22102SDValue
22103X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22104 bool IsSigned, SDValue &Chain) const {
22105 bool IsStrict = Op->isStrictFPOpcode();
22106 SDLoc DL(Op);
22107
22108 EVT DstTy = Op.getValueType();
22109 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22110 EVT TheVT = Value.getValueType();
22111 auto PtrVT = getPointerTy(DAG.getDataLayout());
22112
22113 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22114 // f16 must be promoted before using the lowering in this routine.
22115 // fp128 does not use this lowering.
22116 return SDValue();
22117 }
22118
22119 // If using FIST to compute an unsigned i64, we'll need some fixup
22120 // to handle values above the maximum signed i64. A FIST is always
22121 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22122 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22123
22124 // FIXME: This does not generate an invalid exception if the input does not
22125 // fit in i32. PR44019
22126 if (!IsSigned && DstTy != MVT::i64) {
22127 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22128 // The low 32 bits of the fist result will have the correct uint32 result.
22129 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22130 DstTy = MVT::i64;
22131 }
22132
22133 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22134 DstTy.getSimpleVT() >= MVT::i16 &&
22135 "Unknown FP_TO_INT to lower!");
22136
22137 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22138 // stack slot.
22139 MachineFunction &MF = DAG.getMachineFunction();
22140 unsigned MemSize = DstTy.getStoreSize();
22141 int SSFI =
22142 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22143 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22144
22145 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22146
22147 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22148
22149 if (UnsignedFixup) {
22150 //
22151 // Conversion to unsigned i64 is implemented with a select,
22152 // depending on whether the source value fits in the range
22153 // of a signed i64. Let Thresh be the FP equivalent of
22154 // 0x8000000000000000ULL.
22155 //
22156 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
22157 // FltOfs = (Value >= Thresh) ? Thresh : 0;
22158 // FistSrc = (Value - FltOfs);
22159 // Fist-to-mem64 FistSrc
22160 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22161 // to XOR'ing the 64-bit result with Adjust.
22162 //
22163 // Being a power of 2, Thresh is exactly representable in all FP formats.
22164 // For X87 we'd like to use the smallest FP type for this constant, but
22165 // for DAG type consistency we have to match the FP operand type.
22166
22167 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22168 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22169 bool LosesInfo = false;
22170 if (TheVT == MVT::f64)
22171 // The rounding mode is irrelevant as the conversion should be exact.
22172 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22173 &LosesInfo);
22174 else if (TheVT == MVT::f80)
22175 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22176 APFloat::rmNearestTiesToEven, &LosesInfo);
22177
22178 assert(Status == APFloat::opOK && !LosesInfo &&
22179 "FP conversion should have been exact");
22180
22181 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22182
22183 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22184 *DAG.getContext(), TheVT);
22185 SDValue Cmp;
22186 if (IsStrict) {
22187 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22188 /*IsSignaling*/ true);
22189 Chain = Cmp.getValue(1);
22190 } else {
22191 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22192 }
22193
22194 // Our preferred lowering of
22195 //
22196 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22197 //
22198 // is
22199 //
22200 // (Value >= Thresh) << 63
22201 //
22202 // but since we can get here after LegalOperations, DAGCombine might do the
22203 // wrong thing if we create a select. So, directly create the preferred
22204 // version.
22205 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22206 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22207 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22208
22209 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22210 DAG.getConstantFP(0.0, DL, TheVT));
22211
22212 if (IsStrict) {
22213 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22214 { Chain, Value, FltOfs });
22215 Chain = Value.getValue(1);
22216 } else
22217 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22218 }
22219
22220 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22221
22222 // FIXME: This causes a redundant load/store if the SSE-class value is already
22223 // in memory, such as if it is on the call stack.
22224 if (isScalarFPTypeInSSEReg(TheVT)) {
22225 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22226 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22227 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22228 SDValue Ops[] = { Chain, StackSlot };
22229
22230 unsigned FLDSize = TheVT.getStoreSize();
22231 assert(FLDSize <= MemSize && "Stack slot not big enough");
22232 MachineMemOperand *MMO = MF.getMachineMemOperand(
22233 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22234 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22235 Chain = Value.getValue(1);
22236 }
22237
22238 // Build the FP_TO_INT*_IN_MEM
22239 MachineMemOperand *MMO = MF.getMachineMemOperand(
22240 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22241 SDValue Ops[] = { Chain, Value, StackSlot };
22242 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22243 DAG.getVTList(MVT::Other),
22244 Ops, DstTy, MMO);
22245
22246 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22247 Chain = Res.getValue(1);
22248
22249 // If we need an unsigned fixup, XOR the result with adjust.
22250 if (UnsignedFixup)
22251 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22252
22253 return Res;
22254}
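A hypothetical scalar model of the unsigned fixup above (helper name invented; the input is assumed to be non-negative and below 2^64, out-of-range behavior is not modeled):

#include <cstdint>

// Hypothetical scalar model of the FIST-based unsigned conversion above.
static uint64_t F64ToU64(double Value) {
  const double Thresh = 0x1.0p63;                       // FP equivalent of 0x8000000000000000
  double FltOfs = Value >= Thresh ? Thresh : 0.0;
  uint64_t Adjust = Value >= Thresh ? (1ULL << 63) : 0; // (Value >= Thresh) << 63
  int64_t Fist = (int64_t)(Value - FltOfs);             // signed conversion of the adjusted value
  return (uint64_t)Fist ^ Adjust;                       // put the top bit back
}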
22255
22256static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22257 const X86Subtarget &Subtarget) {
22258 MVT VT = Op.getSimpleValueType();
22259 SDValue In = Op.getOperand(0);
22260 MVT InVT = In.getSimpleValueType();
22261 SDLoc dl(Op);
22262 unsigned Opc = Op.getOpcode();
22263
22264 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22265 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22266 "Unexpected extension opcode");
22267 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22268 "Expected same number of elements");
22269 assert((VT.getVectorElementType() == MVT::i16 ||
22270 VT.getVectorElementType() == MVT::i32 ||
22271 VT.getVectorElementType() == MVT::i64) &&
22272 "Unexpected element type");
22273 assert((InVT.getVectorElementType() == MVT::i8 ||
22274 InVT.getVectorElementType() == MVT::i16 ||
22275 InVT.getVectorElementType() == MVT::i32) &&
22276 "Unexpected element type");
22277
22278 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22279
22280 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22281 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22282 return splitVectorIntUnary(Op, DAG);
22283 }
22284
22285 if (Subtarget.hasInt256())
22286 return Op;
22287
22288 // Optimize vectors in AVX mode:
22289 //
22290 // v8i16 -> v8i32
22291 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22292 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22293 // Concat upper and lower parts.
22294 //
22295 // v4i32 -> v4i64
22296 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22297 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22298 // Concat upper and lower parts.
22299 //
22300 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22301 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22302
22303 // Short-circuit if we can determine that each 128-bit half is the same value.
22304 // Otherwise, this is difficult to match and optimize.
22305 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22306 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22307 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22308
22309 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22310 SDValue Undef = DAG.getUNDEF(InVT);
22311 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22312 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22313 OpHi = DAG.getBitcast(HalfVT, OpHi);
22314
22315 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22316}
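The AVX-only (no AVX2) zero-extension pattern above can be pictured with intrinsics; this is a hypothetical sketch for v8i16 -> v8i32 (helper name invented, requires AVX):

#include <immintrin.h>

// Hypothetical sketch of the v8i16 -> v8i32 zero-extension above.
static __m256i ZExtV8i16ToV8i32(__m128i In) {
  __m128i Lo = _mm_cvtepu16_epi32(In);                      // vpmovzxwd: 4 lower elements
  __m128i Hi = _mm_unpackhi_epi16(In, _mm_setzero_si128()); // vpunpckhwd: 4 upper elements
  return _mm256_set_m128i(Hi, Lo);                          // concat upper and lower parts
}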
22317
22318// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22319static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22320 const SDLoc &dl, SelectionDAG &DAG) {
22321 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22322 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22323 DAG.getIntPtrConstant(0, dl));
22324 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22325 DAG.getIntPtrConstant(8, dl));
22326 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22327 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22328 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22329 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22330}
22331
22332static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22333 const X86Subtarget &Subtarget,
22334 SelectionDAG &DAG) {
22335 MVT VT = Op->getSimpleValueType(0);
22336 SDValue In = Op->getOperand(0);
22337 MVT InVT = In.getSimpleValueType();
22338 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22339 SDLoc DL(Op);
22340 unsigned NumElts = VT.getVectorNumElements();
22341
22342 // For all vectors but vXi8 we can just emit a sign_extend and a shift. This
22343 // avoids a constant pool load.
22344 if (VT.getVectorElementType() != MVT::i8) {
22345 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22346 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22347 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22348 }
22349
22350 // Extend VT if BWI is not supported.
22351 MVT ExtVT = VT;
22352 if (!Subtarget.hasBWI()) {
22353 // If v16i32 is to be avoided, we'll need to split and concatenate.
22354 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22355 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22356
22357 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22358 }
22359
22360 // Widen to 512-bits if VLX is not supported.
22361 MVT WideVT = ExtVT;
22362 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22363 NumElts *= 512 / ExtVT.getSizeInBits();
22364 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22365 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22366 In, DAG.getIntPtrConstant(0, DL));
22367 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22368 NumElts);
22369 }
22370
22371 SDValue One = DAG.getConstant(1, DL, WideVT);
22372 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22373
22374 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22375
22376 // Truncate if we had to extend above.
22377 if (VT != ExtVT) {
22378 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22379 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22380 }
22381
22382 // Extract back to 128/256-bit if we widened.
22383 if (WideVT != VT)
22384 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22385 DAG.getIntPtrConstant(0, DL));
22386
22387 return SelectedVal;
22388}
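A minimal single-lane sketch (plain C++, not LLVM code) of the sign_extend + SRL trick used above for non-vXi8 element types: sign-extending an i1 yields 0 or all-ones, and a logical shift right by (BitWidth - 1) turns that into the zero-extended 0/1 value.

#include <cassert>
#include <cstdint>

static uint32_t zextMaskBit(bool B) {
  int32_t Sext = B ? -1 : 0;                 // ISD::SIGN_EXTEND of the i1 lane
  return static_cast<uint32_t>(Sext) >> 31;  // ISD::SRL by ScalarSizeInBits - 1
}

int main() {
  assert(zextMaskBit(true) == 1u && zextMaskBit(false) == 0u);
}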
22389
22390static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22391 SelectionDAG &DAG) {
22392 SDValue In = Op.getOperand(0);
22393 MVT SVT = In.getSimpleValueType();
22394
22395 if (SVT.getVectorElementType() == MVT::i1)
22396 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22397
22398 assert(Subtarget.hasAVX() && "Expected AVX support");
22399 return LowerAVXExtend(Op, DAG, Subtarget);
22400}
22401
22402/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22403/// It makes use of the fact that vectors with enough leading sign/zero bits
22404/// prevent the PACKSS/PACKUS from saturating the results.
22405/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22406/// within each 128-bit lane.
22407static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22408 const SDLoc &DL, SelectionDAG &DAG,
22409 const X86Subtarget &Subtarget) {
22410 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22411 "Unexpected PACK opcode");
22412 assert(DstVT.isVector() && "VT not a vector?");
22413
22414 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22415 if (!Subtarget.hasSSE2())
22416 return SDValue();
22417
22418 EVT SrcVT = In.getValueType();
22419
22420 // No truncation required, we might get here due to recursive calls.
22421 if (SrcVT == DstVT)
22422 return In;
22423
22424 // We only support vector truncation to 64bits or greater from a
22425 // 128bits or greater source.
22426 unsigned DstSizeInBits = DstVT.getSizeInBits();
22427 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22428 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22429 return SDValue();
22430
22431 unsigned NumElems = SrcVT.getVectorNumElements();
22432 if (!isPowerOf2_32(NumElems))
22433 return SDValue();
22434
22435 LLVMContext &Ctx = *DAG.getContext();
22436 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22437 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22438
22439 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22440
22441 // Pack to the largest type possible:
22442 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22443 EVT InVT = MVT::i16, OutVT = MVT::i8;
22444 if (SrcVT.getScalarSizeInBits() > 16 &&
22445 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22446 InVT = MVT::i32;
22447 OutVT = MVT::i16;
22448 }
22449
22450 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22451 if (SrcVT.is128BitVector()) {
22452 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22453 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22454 In = DAG.getBitcast(InVT, In);
22455 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22456 Res = extractSubVector(Res, 0, DAG, DL, 64);
22457 return DAG.getBitcast(DstVT, Res);
22458 }
22459
22460 // Split lower/upper subvectors.
22461 SDValue Lo, Hi;
22462 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22463
22464 unsigned SubSizeInBits = SrcSizeInBits / 2;
22465 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22466 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22467
22468 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22469 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22470 Lo = DAG.getBitcast(InVT, Lo);
22471 Hi = DAG.getBitcast(InVT, Hi);
22472 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22473 return DAG.getBitcast(DstVT, Res);
22474 }
22475
22476 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22477 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22478 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22479 Lo = DAG.getBitcast(InVT, Lo);
22480 Hi = DAG.getBitcast(InVT, Hi);
22481 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22482
22483 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22484 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22485 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22486 SmallVector<int, 64> Mask;
22487 int Scale = 64 / OutVT.getScalarSizeInBits();
22488 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22489 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22490
22491 if (DstVT.is256BitVector())
22492 return DAG.getBitcast(DstVT, Res);
22493
22494 // If 512bit -> 128bit truncate another stage.
22495 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22496 Res = DAG.getBitcast(PackedVT, Res);
22497 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22498 }
22499
22500 // Recursively pack lower/upper subvectors, concat result and pack again.
22501 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22502 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22503 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22504 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22505
22506 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22507 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22508 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22509}
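A minimal scalar sketch (assuming PACKSSDW-style signed saturation to i16, not the DAG code itself) of why the pack above behaves as a plain truncation: when the source lane already has enough leading sign bits to fit the narrower type, the saturation never fires.

#include <algorithm>
#include <cassert>
#include <cstdint>

// One PACKSSDW output lane: saturate an i32 to the signed i16 range.
static int16_t packssLane(int32_t V) {
  return static_cast<int16_t>(std::clamp<int32_t>(V, INT16_MIN, INT16_MAX));
}

int main() {
  // All of these fit in i16 (at least 17 sign bits), so pack == truncate.
  for (int32_t V : {-32768, -1, 0, 42, 32767})
    assert(packssLane(V) == static_cast<int16_t>(V));
}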
22510
22511static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22512 const X86Subtarget &Subtarget) {
22513
22514 SDLoc DL(Op);
22515 MVT VT = Op.getSimpleValueType();
22516 SDValue In = Op.getOperand(0);
22517 MVT InVT = In.getSimpleValueType();
22518
22519 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22520
22521 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22522 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22523 if (InVT.getScalarSizeInBits() <= 16) {
22524 if (Subtarget.hasBWI()) {
22525 // legal, will go to VPMOVB2M, VPMOVW2M
22526 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22527 // We need to shift to get the lsb into sign position.
22528 // Shifting packed bytes is not supported natively, so bitcast to words.
22529 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22530 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22531 DAG.getBitcast(ExtVT, In),
22532 DAG.getConstant(ShiftInx, DL, ExtVT));
22533 In = DAG.getBitcast(InVT, In);
22534 }
22535 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22536 In, ISD::SETGT);
22537 }
22538 // Use TESTD/Q, extended vector to packed dword/qword.
22539 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22540 "Unexpected vector type.");
22541 unsigned NumElts = InVT.getVectorNumElements();
22542 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22543 // We need to change to a wider element type that we have support for.
22544 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22545 // For 16 element vectors we extend to v16i32 unless we are explicitly
22546 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22547 // we need to split into two 8 element vectors which we can extend to v8i32,
22548 // truncate and concat the results. There's an additional complication if
22549 // the original type is v16i8. In that case we can't split the v16i8
22550 // directly, so we need to shuffle high elements to low and use
22551 // sign_extend_vector_inreg.
22552 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22553 SDValue Lo, Hi;
22554 if (InVT == MVT::v16i8) {
22555 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22556 Hi = DAG.getVectorShuffle(
22557 InVT, DL, In, In,
22558 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22559 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22560 } else {
22561 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22562 Lo = extract128BitVector(In, 0, DAG, DL);
22563 Hi = extract128BitVector(In, 8, DAG, DL);
22564 }
22565 // We're split now, just emit two truncates and a concat. The two
22566 // truncates will trigger legalization to come back to this function.
22567 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22568 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22569 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22570 }
22571 // We either have 8 elements or we're allowed to use 512-bit vectors.
22572 // If we have VLX, we want to use the narrowest vector that can get the
22573 // job done so we use vXi32.
22574 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22575 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22576 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22577 InVT = ExtVT;
22578 ShiftInx = InVT.getScalarSizeInBits() - 1;
22579 }
22580
22581 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22582 // We need to shift to get the lsb into sign position.
22583 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22584 DAG.getConstant(ShiftInx, DL, InVT));
22585 }
22586 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22587 if (Subtarget.hasDQI())
22588 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22589 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22590}
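A minimal per-lane sketch (plain C++, not DAG code) of the shift-LSB-to-MSB step used above before VPMOVB2M/VPMOVW2M or the compare: after the shift, the lane's sign bit equals the original i1 value, which is what the mask-extraction instructions (or a signed compare against zero) read.

#include <cassert>
#include <cstdint>

static bool truncLaneToI1(uint32_t Lane) {
  uint32_t Shifted = Lane << 31;   // ShiftInx = ScalarSizeInBits - 1
  return (Shifted >> 31) != 0;     // the sign bit of the shifted lane
}

int main() {
  assert(truncLaneToI1(1) && truncLaneToI1(3));
  assert(!truncLaneToI1(0) && !truncLaneToI1(2));
}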
22591
22592SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22593 SDLoc DL(Op);
22594 MVT VT = Op.getSimpleValueType();
22595 SDValue In = Op.getOperand(0);
22596 MVT InVT = In.getSimpleValueType();
22597 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22598
22599 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22600 "Invalid TRUNCATE operation");
22601
22602 // If we're called by the type legalizer, handle a few cases.
22603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22604 if (!TLI.isTypeLegal(InVT)) {
22605 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22606 VT.is128BitVector()) {
22607 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22608 "Unexpected subtarget!");
22609 // The default behavior is to truncate one step, concatenate, and then
22610 // truncate the remainder. We'd rather produce two 64-bit results and
22611 // concatenate those.
22612 SDValue Lo, Hi;
22613 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22614
22615 EVT LoVT, HiVT;
22616 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22617
22618 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22619 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22620 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22621 }
22622
22623 // Otherwise let default legalization handle it.
22624 return SDValue();
22625 }
22626
22627 if (VT.getVectorElementType() == MVT::i1)
22628 return LowerTruncateVecI1(Op, DAG, Subtarget);
22629
22630 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22631 if (Subtarget.hasAVX512()) {
22632 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22633 assert(VT == MVT::v32i8 && "Unexpected VT!");
22634 return splitVectorIntUnary(Op, DAG);
22635 }
22636
22637 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
22638 // and then truncate that. But we should only do that if we haven't been
22639 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22640 // handled by isel patterns.
22641 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22642 Subtarget.canExtendTo512DQ())
22643 return Op;
22644 }
22645
22646 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22647 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22648
22649 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22650 // that extend all the way to the packed/truncated value.
22651 // Pre-SSE41 we can only use PACKUSWB.
22652 KnownBits Known = DAG.computeKnownBits(In);
22653 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22654 if (SDValue V =
22655 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22656 return V;
22657
22658 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22659 // extend all the way to the packed/truncated value.
22660 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22661 if (SDValue V =
22662 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22663 return V;
22664
22665 // Handle truncation of V256 to V128 using shuffles.
22666 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22667
22668 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22669 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22670 if (Subtarget.hasInt256()) {
22671 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22672 In = DAG.getBitcast(MVT::v8i32, In);
22673 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22674 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22675 DAG.getIntPtrConstant(0, DL));
22676 }
22677
22678 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22679 DAG.getIntPtrConstant(0, DL));
22680 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22681 DAG.getIntPtrConstant(2, DL));
22682 static const int ShufMask[] = {0, 2, 4, 6};
22683 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22684 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22685 }
22686
22687 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22688 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22689 if (Subtarget.hasInt256()) {
22690 // The PSHUFB mask:
22691 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22692 -1, -1, -1, -1, -1, -1, -1, -1,
22693 16, 17, 20, 21, 24, 25, 28, 29,
22694 -1, -1, -1, -1, -1, -1, -1, -1 };
22695 In = DAG.getBitcast(MVT::v32i8, In);
22696 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22697 In = DAG.getBitcast(MVT::v4i64, In);
22698
22699 static const int ShufMask2[] = {0, 2, -1, -1};
22700 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22701 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22702 DAG.getIntPtrConstant(0, DL));
22703 return DAG.getBitcast(MVT::v8i16, In);
22704 }
22705
22706 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22707 DAG.getIntPtrConstant(0, DL));
22708 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22709 DAG.getIntPtrConstant(4, DL));
22710
22711 // The PSHUFB mask:
22712 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22713
22714 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22715 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22716
22717 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22718 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22719
22720 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22721 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22722
22723 // The MOVLHPS Mask:
22724 static const int ShufMask2[] = {0, 1, 4, 5};
22725 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22726 return DAG.getBitcast(MVT::v8i16, res);
22727 }
22728
22729 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22730 // Use an AND to zero the upper bits for PACKUS.
22731 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22732
22733 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22734 DAG.getIntPtrConstant(0, DL));
22735 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22736 DAG.getIntPtrConstant(8, DL));
22737 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22738 }
22739
22740 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22740)
;
22741}
22742
22743// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22744// behaves on out of range inputs to generate optimized conversions.
22745static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22746 SelectionDAG &DAG,
22747 const X86Subtarget &Subtarget) {
22748 MVT SrcVT = Src.getSimpleValueType();
22749 unsigned DstBits = VT.getScalarSizeInBits();
22750 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
22751
22752 // Calculate the converted result for values in the range 0 to
22753 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22754 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
22755 SDValue Big =
22756 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
22757 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
22758 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
22759
22760 // The "CVTTP2SI" instruction conveniently sets the sign bit if
22761 // and only if the value was out of range. So we can use that
22762 // as our indicator that we should use "Big" instead of "Small".
22763 //
22764 // Use "Small" if "IsOverflown" has all bits cleared
22765 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22766
22767 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22768 // use the slightly slower blendv select instead.
22769 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22770 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22771 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22772 }
22773
22774 SDValue IsOverflown =
22775 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22776 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22777 return DAG.getNode(ISD::OR, dl, VT, Small,
22778 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22779}
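A minimal scalar sketch of the Small/Big selection in expandFP_TO_UINT_SSE (not the DAG code): the helper below models the cvttps2dq behaviour of returning the integer-indefinite value (INT32_MIN) for out-of-range or NaN inputs, and the final OR/AND reproduces the sign-splat select.

#include <cassert>
#include <cstdint>

static int32_t cvttToI32(float F) {                 // models X86ISD::CVTTP2SI
  if (!(F >= -2147483648.0f && F < 2147483648.0f))  // out of range or NaN
    return INT32_MIN;                               // "integer indefinite"
  return static_cast<int32_t>(F);
}

static uint32_t fpToUint32(float F) {
  int32_t Small = cvttToI32(F);                     // correct for [0, 2^31)
  int32_t Big = cvttToI32(F - 2147483648.0f);       // correct for [2^31, 2^32)
  // VSRAI splats Small's sign bit: all ones exactly when Small overflowed.
  uint32_t IsOverflown = Small < 0 ? 0xFFFFFFFFu : 0u;
  return static_cast<uint32_t>(Small) | (static_cast<uint32_t>(Big) & IsOverflown);
}

int main() {
  assert(fpToUint32(0.0f) == 0u);
  assert(fpToUint32(1234.0f) == 1234u);
  assert(fpToUint32(2147483648.0f) == 2147483648u); // 2^31 needs the "Big" path
  assert(fpToUint32(4294967040.0f) == 4294967040u); // largest float below 2^32
}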
22780
22781SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22782 bool IsStrict = Op->isStrictFPOpcode();
22783 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22784 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22785 MVT VT = Op->getSimpleValueType(0);
22786 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22787 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22788 MVT SrcVT = Src.getSimpleValueType();
22789 SDLoc dl(Op);
22790
22791 SDValue Res;
22792 if (isSoftFP16(SrcVT)) {
22793 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
22794 if (IsStrict)
22795 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
22796 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
22797 {NVT, MVT::Other}, {Chain, Src})});
22798 return DAG.getNode(Op.getOpcode(), dl, VT,
22799 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
22800 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
22801 return Op;
22802 }
22803
22804 if (VT.isVector()) {
22805 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22806 MVT ResVT = MVT::v4i32;
22807 MVT TruncVT = MVT::v4i1;
22808 unsigned Opc;
22809 if (IsStrict)
22810 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22811 else
22812 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22813
22814 if (!IsSigned && !Subtarget.hasVLX()) {
22815 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22816 // Widen to 512-bits.
22817 ResVT = MVT::v8i32;
22818 TruncVT = MVT::v8i1;
22819 Opc = Op.getOpcode();
22820 // Need to concat with zero vector for strict fp to avoid spurious
22821 // exceptions.
22822 // TODO: Should we just do this for non-strict as well?
22823 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22824 : DAG.getUNDEF(MVT::v8f64);
22825 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22826 DAG.getIntPtrConstant(0, dl));
22827 }
22828 if (IsStrict) {
22829 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22830 Chain = Res.getValue(1);
22831 } else {
22832 Res = DAG.getNode(Opc, dl, ResVT, Src);
22833 }
22834
22835 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22836 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22837 DAG.getIntPtrConstant(0, dl));
22838 if (IsStrict)
22839 return DAG.getMergeValues({Res, Chain}, dl);
22840 return Res;
22841 }
22842
22843 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
22844 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
22845 return Op;
22846
22847 MVT ResVT = VT;
22848 MVT EleVT = VT.getVectorElementType();
22849 if (EleVT != MVT::i64)
22850 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
22851
22852 if (SrcVT != MVT::v8f16) {
22853 SDValue Tmp =
22854 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
22855 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
22856 Ops[0] = Src;
22857 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
22858 }
22859
22860 if (IsStrict) {
22861 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
22862 : X86ISD::STRICT_CVTTP2UI,
22863 dl, {ResVT, MVT::Other}, {Chain, Src});
22864 Chain = Res.getValue(1);
22865 } else {
22866 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
22867 ResVT, Src);
22868 }
22869
22870 // TODO: Need to add exception check code for strict FP.
22871 if (EleVT.getSizeInBits() < 16) {
22872 ResVT = MVT::getVectorVT(EleVT, 8);
22873 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
22874 }
22875
22876 if (ResVT != VT)
22877 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22878 DAG.getIntPtrConstant(0, dl));
22879
22880 if (IsStrict)
22881 return DAG.getMergeValues({Res, Chain}, dl);
22882 return Res;
22883 }
22884
22885 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
22886 if (VT.getVectorElementType() == MVT::i16) {
22887 assert((SrcVT.getVectorElementType() == MVT::f32 ||
22888 SrcVT.getVectorElementType() == MVT::f64) &&
22889 "Expected f32/f64 vector!");
22890 MVT NVT = VT.changeVectorElementType(MVT::i32);
22891 if (IsStrict) {
22892 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
22893 : ISD::STRICT_FP_TO_UINT,
22894 dl, {NVT, MVT::Other}, {Chain, Src});
22895 Chain = Res.getValue(1);
22896 } else {
22897 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
22898 NVT, Src);
22899 }
22900
22901 // TODO: Need to add exception check code for strict FP.
22902 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22903
22904 if (IsStrict)
22905 return DAG.getMergeValues({Res, Chain}, dl);
22906 return Res;
22907 }
22908
22909 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
22910 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
22911 assert(!IsSigned && "Expected unsigned conversion!");
22912 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
22913 return Op;
22914 }
22915
22916 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
22917 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
22918 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
22919 Subtarget.useAVX512Regs()) {
22920 assert(!IsSigned && "Expected unsigned conversion!");
22921 assert(!Subtarget.hasVLX() && "Unexpected features!");
22922 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22923 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22924 // Need to concat with zero vector for strict fp to avoid spurious
22925 // exceptions.
22926 // TODO: Should we just do this for non-strict as well?
22927 SDValue Tmp =
22928 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22929 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22930 DAG.getIntPtrConstant(0, dl));
22931
22932 if (IsStrict) {
22933 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
22934 {Chain, Src});
22935 Chain = Res.getValue(1);
22936 } else {
22937 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
22938 }
22939
22940 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22941 DAG.getIntPtrConstant(0, dl));
22942
22943 if (IsStrict)
22944 return DAG.getMergeValues({Res, Chain}, dl);
22945 return Res;
22946 }
22947
22948 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
22949 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
22950 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
22951 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
22952 assert(!Subtarget.hasVLX() && "Unexpected features!");
22953 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
22954 // Need to concat with zero vector for strict fp to avoid spurious
22955 // exceptions.
22956 // TODO: Should we just do this for non-strict as well?
22957 SDValue Tmp =
22958 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22959 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22960 DAG.getIntPtrConstant(0, dl));
22961
22962 if (IsStrict) {
22963 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22964 {Chain, Src});
22965 Chain = Res.getValue(1);
22966 } else {
22967 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
22968 }
22969
22970 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22971 DAG.getIntPtrConstant(0, dl));
22972
22973 if (IsStrict)
22974 return DAG.getMergeValues({Res, Chain}, dl);
22975 return Res;
22976 }
22977
22978 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
22979 if (!Subtarget.hasVLX()) {
22980 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
22981 // legalizer and then widened again by vector op legalization.
22982 if (!IsStrict)
22983 return SDValue();
22984
22985 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
22986 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
22987 {Src, Zero, Zero, Zero});
22988 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22989 {Chain, Tmp});
22990 SDValue Chain = Tmp.getValue(1);
22991 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
22992 DAG.getIntPtrConstant(0, dl));
22993 return DAG.getMergeValues({Tmp, Chain}, dl);
22994 }
22995
22996 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
22997 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
22998 DAG.getUNDEF(MVT::v2f32));
22999 if (IsStrict) {
23000 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23001 : X86ISD::STRICT_CVTTP2UI;
23002 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23003 }
23004 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23005 return DAG.getNode(Opc, dl, VT, Tmp);
23006 }
23007
23008 // Generate optimized instructions for pre AVX512 unsigned conversions from
23009 // vXf32 to vXi32.
23010 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23011 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23012 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23013 assert(!IsSigned && "Expected unsigned conversion!");
23014 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23015 }
23016
23017 return SDValue();
23018 }
23019
23020 assert(!VT.isVector());
23021
23022 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23023
23024 if (!IsSigned && UseSSEReg) {
23025 // Conversions from f32/f64 with AVX512 should be legal.
23026 if (Subtarget.hasAVX512())
23027 return Op;
23028
23029 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23030 // behaves on out of range inputs to generate optimized conversions.
23031 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23032 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23033 unsigned DstBits = VT.getScalarSizeInBits();
23034 APInt UIntLimit = APInt::getSignMask(DstBits);
23035 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23036 DAG.getConstant(UIntLimit, dl, VT));
23037 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23038
23039 // Calculate the converted result for values in the range:
23040 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23041 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23042 SDValue Small =
23043 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23044 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23045 SDValue Big = DAG.getNode(
23046 X86ISD::CVTTS2SI, dl, VT,
23047 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23048 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23049
23050 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23051 // and only if the value was out of range. So we can use that
23052 // as our indicator that we should use "Big" instead of "Small".
23053 //
23054 // Use "Small" if "IsOverflown" has all bits cleared
23055 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23056 SDValue IsOverflown = DAG.getNode(
23057 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23058 return DAG.getNode(ISD::OR, dl, VT, Small,
23059 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23060 }
23061
23062 // Use default expansion for i64.
23063 if (VT == MVT::i64)
23064 return SDValue();
23065
23066 assert(VT == MVT::i32 && "Unexpected VT!");
23067
23068 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23069 // FIXME: This does not generate an invalid exception if the input does not
23070 // fit in i32. PR44019
23071 if (Subtarget.is64Bit()) {
23072 if (IsStrict) {
23073 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23074 {Chain, Src});
23075 Chain = Res.getValue(1);
23076 } else
23077 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23078
23079 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23080 if (IsStrict)
23081 return DAG.getMergeValues({Res, Chain}, dl);
23082 return Res;
23083 }
23084
23085 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23086 // use fisttp which will be handled later.
23087 if (!Subtarget.hasSSE3())
23088 return SDValue();
23089 }
23090
23091 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23092 // FIXME: This does not generate an invalid exception if the input does not
23093 // fit in i16. PR44019
23094 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23095 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23096 if (IsStrict) {
23097 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23098 {Chain, Src});
23099 Chain = Res.getValue(1);
23100 } else
23101 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23102
23103 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23104 if (IsStrict)
23105 return DAG.getMergeValues({Res, Chain}, dl);
23106 return Res;
23107 }
23108
23109 // If this is a FP_TO_SINT using SSEReg we're done.
23110 if (UseSSEReg && IsSigned)
23111 return Op;
23112
23113 // fp128 needs to use a libcall.
23114 if (SrcVT == MVT::f128) {
23115 RTLIB::Libcall LC;
23116 if (IsSigned)
23117 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23118 else
23119 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23120
23121 MakeLibCallOptions CallOptions;
23122 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23123 SDLoc(Op), Chain);
23124
23125 if (IsStrict)
23126 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23127
23128 return Tmp.first;
23129 }
23130
23131 // Fall back to X87.
23132 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23133 if (IsStrict)
23134 return DAG.getMergeValues({V, Chain}, dl);
23135 return V;
23136 }
23137
23138 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23138)
;
23139}
23140
23141SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23142 SelectionDAG &DAG) const {
23143 SDValue Src = Op.getOperand(0);
23144 MVT SrcVT = Src.getSimpleValueType();
23145
23146 if (SrcVT == MVT::f16)
23147 return SDValue();
23148
23149 // If the source is in an SSE register, the node is Legal.
23150 if (isScalarFPTypeInSSEReg(SrcVT))
23151 return Op;
23152
23153 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23154}
23155
23156SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23157 SelectionDAG &DAG) const {
23158 EVT DstVT = N->getValueType(0);
23159 SDValue Src = N->getOperand(0);
23160 EVT SrcVT = Src.getValueType();
23161
23162 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23163 // f16 must be promoted before using the lowering in this routine.
23164 // fp128 does not use this lowering.
23165 return SDValue();
23166 }
23167
23168 SDLoc DL(N);
23169 SDValue Chain = DAG.getEntryNode();
23170
23171 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23172
23173 // If we're converting from SSE, the stack slot needs to hold both types.
23174 // Otherwise it only needs to hold the DstVT.
23175 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23176 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23177 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23178 MachinePointerInfo MPI =
23179 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23180
23181 if (UseSSE) {
23182 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23183 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23184 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23185 SDValue Ops[] = { Chain, StackPtr };
23186
23187 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23188 /*Align*/ std::nullopt,
23189 MachineMemOperand::MOLoad);
23190 Chain = Src.getValue(1);
23191 }
23192
23193 SDValue StoreOps[] = { Chain, Src, StackPtr };
23194 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23195 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23196 MachineMemOperand::MOStore);
23197
23198 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23199}
23200
23201SDValue
23202X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23203 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23204 // but making use of X86 specifics to produce better instruction sequences.
23205 SDNode *Node = Op.getNode();
23206 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23207 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23208 SDLoc dl(SDValue(Node, 0));
23209 SDValue Src = Node->getOperand(0);
23210
23211 // There are three types involved here: SrcVT is the source floating point
23212 // type, DstVT is the type of the result, and TmpVT is the result of the
23213 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23214 // DstVT).
23215 EVT SrcVT = Src.getValueType();
23216 EVT DstVT = Node->getValueType(0);
23217 EVT TmpVT = DstVT;
23218
23219 // This code is only for floats and doubles. Fall back to generic code for
23220 // anything else.
23221 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23222 return SDValue();
23223
23224 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23225 unsigned SatWidth = SatVT.getScalarSizeInBits();
23226 unsigned DstWidth = DstVT.getScalarSizeInBits();
23227 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23228 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23229 "Expected saturation width smaller than result width");
23230
23231 // Promote result of FP_TO_*INT to at least 32 bits.
23232 if (TmpWidth < 32) {
23233 TmpVT = MVT::i32;
23234 TmpWidth = 32;
23235 }
23236
23237 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23238 // us to use a native signed conversion instead.
23239 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23240 TmpVT = MVT::i64;
23241 TmpWidth = 64;
23242 }
23243
23244 // If the saturation width is smaller than the size of the temporary result,
23245 // we can always use signed conversion, which is native.
23246 if (SatWidth < TmpWidth)
23247 FpToIntOpcode = ISD::FP_TO_SINT;
23248
23249 // Determine minimum and maximum integer values and their corresponding
23250 // floating-point values.
23251 APInt MinInt, MaxInt;
23252 if (IsSigned) {
23253 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23254 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23255 } else {
23256 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23257 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23258 }
23259
23260 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23261 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23262
23263 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23264 MinInt, IsSigned, APFloat::rmTowardZero);
23265 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23266 MaxInt, IsSigned, APFloat::rmTowardZero);
23267 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23268 && !(MaxStatus & APFloat::opStatus::opInexact);
23269
23270 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23271 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23272
23273 // If the integer bounds are exactly representable as floats, emit a
23274 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23275 if (AreExactFloatBounds) {
23276 if (DstVT != TmpVT) {
23277 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23278 SDValue MinClamped = DAG.getNode(
23279 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23280 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23281 SDValue BothClamped = DAG.getNode(
23282 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23283 // Convert clamped value to integer.
23284 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23285
23286 // NaN will become INDVAL, with the top bit set and the rest zero.
23287 // Truncation will discard the top bit, resulting in zero.
23288 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23289 }
23290
23291 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23292 SDValue MinClamped = DAG.getNode(
23293 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23294 // Clamp by MaxFloat from above. NaN cannot occur.
23295 SDValue BothClamped = DAG.getNode(
23296 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23297 // Convert clamped value to integer.
23298 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23299
23300 if (!IsSigned) {
23301 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23302 // which is zero.
23303 return FpToInt;
23304 }
23305
23306 // Otherwise, select zero if Src is NaN.
23307 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23308 return DAG.getSelectCC(
23309 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23310 }
23311
23312 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23313 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23314
23315 // Result of direct conversion, which may be selected away.
23316 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23317
23318 if (DstVT != TmpVT) {
23319 // NaN will become INDVAL, with the top bit set and the rest zero.
23320 // Truncation will discard the top bit, resulting in zero.
23321 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23322 }
23323
23324 SDValue Select = FpToInt;
23325 // For signed conversions where we saturate to the same size as the
23326 // result type of the fptoi instructions, INDVAL coincides with integer
23327 // minimum, so we don't need to explicitly check it.
23328 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23329 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23330 // MinInt if Src is NaN.
23331 Select = DAG.getSelectCC(
23332 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23333 }
23334
23335 // If Src OGT MaxFloat, select MaxInt.
23336 Select = DAG.getSelectCC(
23337 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23338
23339 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23340 // is already zero. The promoted case was already handled above.
23341 if (!IsSigned || DstVT != TmpVT) {
23342 return Select;
23343 }
23344
23345 // Otherwise, select 0 if Src is NaN.
23346 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23347 return DAG.getSelectCC(
23348 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23349}
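A minimal scalar sketch of the exact-float-bounds path in LowerFP_TO_INT_SAT (not the DAG code), shown for double -> i32 where both integer bounds are exactly representable as doubles: clamp from below and above, map NaN to zero as the SETUO select does, then convert.

#include <cassert>
#include <cmath>
#include <cstdint>

static int32_t fpToSint32Sat(double Src) {
  const double MinFloat = -2147483648.0;   // INT32_MIN, exact in double
  const double MaxFloat = 2147483647.0;    // INT32_MAX, exact in double
  if (std::isnan(Src))                     // NaN selects zero
    return 0;
  double Clamped = Src < MinFloat ? MinFloat : Src;    // clamp from below (FMAX)
  Clamped = Clamped > MaxFloat ? MaxFloat : Clamped;   // clamp from above (FMIN)
  return static_cast<int32_t>(Clamped);                // now always in range
}

int main() {
  assert(fpToSint32Sat(1.7) == 1);
  assert(fpToSint32Sat(-1e12) == INT32_MIN);
  assert(fpToSint32Sat(1e12) == INT32_MAX);
  assert(fpToSint32Sat(NAN) == 0);
}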
23350
23351SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23352 bool IsStrict = Op->isStrictFPOpcode();
23353
23354 SDLoc DL(Op);
23355 MVT VT = Op.getSimpleValueType();
23356 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23357 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23358 MVT SVT = In.getSimpleValueType();
23359
23360 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23361 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23362 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23363 !Subtarget.getTargetTriple().isOSDarwin()))
23364 return SDValue();
23365
23366 if (SVT == MVT::f16) {
23367 if (Subtarget.hasFP16())
23368 return Op;
23369
23370 if (VT != MVT::f32) {
23371 if (IsStrict)
23372 return DAG.getNode(
23373 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23374 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23375 {MVT::f32, MVT::Other}, {Chain, In})});
23376
23377 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23378 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23379 }
23380
23381 if (!Subtarget.hasF16C()) {
23382 if (!Subtarget.getTargetTriple().isOSDarwin())
23383 return SDValue();
23384
23385 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23386
23387 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23388 TargetLowering::CallLoweringInfo CLI(DAG);
23389 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23390
23391 In = DAG.getBitcast(MVT::i16, In);
23392 TargetLowering::ArgListTy Args;
23393 TargetLowering::ArgListEntry Entry;
23394 Entry.Node = In;
23395 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23396 Entry.IsSExt = false;
23397 Entry.IsZExt = true;
23398 Args.push_back(Entry);
23399
23400 SDValue Callee = DAG.getExternalSymbol(
23401 getLibcallName(RTLIB::FPEXT_F16_F32),
23402 getPointerTy(DAG.getDataLayout()));
23403 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23404 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23405 std::move(Args));
23406
23407 SDValue Res;
23408 std::tie(Res,Chain) = LowerCallTo(CLI);
23409 if (IsStrict)
23410 Res = DAG.getMergeValues({Res, Chain}, DL);
23411
23412 return Res;
23413 }
23414
23415 In = DAG.getBitcast(MVT::i16, In);
23416 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23417 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23418 DAG.getIntPtrConstant(0, DL));
23419 SDValue Res;
23420 if (IsStrict) {
23421 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23422 {Chain, In});
23423 Chain = Res.getValue(1);
23424 } else {
23425 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23426 DAG.getTargetConstant(4, DL, MVT::i32));
23427 }
23428 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23429 DAG.getIntPtrConstant(0, DL));
23430 if (IsStrict)
23431 return DAG.getMergeValues({Res, Chain}, DL);
23432 return Res;
23433 }
23434
23435 if (!SVT.isVector())
23436 return Op;
23437
23438 if (SVT.getVectorElementType() == MVT::f16) {
23439 assert(Subtarget.hasF16C() && "Unexpected features!");
23440 if (SVT == MVT::v2f16)
23441 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23442 DAG.getUNDEF(MVT::v2f16));
23443 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23444 DAG.getUNDEF(MVT::v4f16));
23445 if (IsStrict)
23446 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23447 {Op->getOperand(0), Res});
23448 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23449 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23450 return Op;
23451 }
23452
23453 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23454
23455 SDValue Res =
23456 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23457 if (IsStrict)
23458 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23459 {Op->getOperand(0), Res});
23460 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23461}
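A minimal F16C intrinsics sketch (compile with -mf16c; an illustration of the same idea, not the DAG code above) of the scalar f16 -> f32 extend: place the half bits in lane 0 of a zeroed 128-bit vector, convert with CVTPH2PS, and extract lane 0.

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static float halfBitsToFloat(uint16_t HalfBits) {
  __m128i In = _mm_cvtsi32_si128(HalfBits);  // insert into lane 0, zero the rest
  __m128 Wide = _mm_cvtph_ps(In);            // CVTPH2PS on the low 4 half lanes
  return _mm_cvtss_f32(Wide);                // extract element 0 as f32
}

int main() {
  std::printf("%f\n", halfBitsToFloat(0x3C00)); // 0x3C00 is 1.0 in IEEE half
}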
23462
23463SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23464 bool IsStrict = Op->isStrictFPOpcode();
23465
23466 SDLoc DL(Op);
23467 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23468 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23469 MVT VT = Op.getSimpleValueType();
23470 MVT SVT = In.getSimpleValueType();
23471
23472 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23473 return SDValue();
23474
23475 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23476 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23477 if (!Subtarget.getTargetTriple().isOSDarwin())
23478 return SDValue();
23479
23480 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23481 TargetLowering::CallLoweringInfo CLI(DAG);
23482 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23483
23484 TargetLowering::ArgListTy Args;
23485 TargetLowering::ArgListEntry Entry;
23486 Entry.Node = In;
23487 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23488 Entry.IsSExt = false;
23489 Entry.IsZExt = true;
23490 Args.push_back(Entry);
23491
23492 SDValue Callee = DAG.getExternalSymbol(
23493 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23494 : RTLIB::FPROUND_F32_F16),
23495 getPointerTy(DAG.getDataLayout()));
23496 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23497 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23498 std::move(Args));
23499
23500 SDValue Res;
23501 std::tie(Res, Chain) = LowerCallTo(CLI);
23502
23503 Res = DAG.getBitcast(MVT::f16, Res);
23504
23505 if (IsStrict)
23506 Res = DAG.getMergeValues({Res, Chain}, DL);
23507
23508 return Res;
23509 }
23510
23511 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23512 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23513 return SDValue();
23514
23515 if (VT.isVector())
23516 return Op;
23517
23518 SDValue Res;
23519 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23520 MVT::i32);
23521 if (IsStrict) {
23522 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23523 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23524 DAG.getIntPtrConstant(0, DL));
23525 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23526 {Chain, Res, Rnd});
23527 Chain = Res.getValue(1);
23528 } else {
23529 // FIXME: Should we use zeros for upper elements for non-strict?
23530 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23531 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23532 }
23533
23534 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23535 DAG.getIntPtrConstant(0, DL));
23536 Res = DAG.getBitcast(MVT::f16, Res);
23537
23538 if (IsStrict)
23539 return DAG.getMergeValues({Res, Chain}, DL);
23540
23541 return Res;
23542 }
23543
23544 return Op;
23545}
23546
23547static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23548 bool IsStrict = Op->isStrictFPOpcode();
23549 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23550   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23551          "Unexpected VT!");
23552
23553 SDLoc dl(Op);
23554 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23555 DAG.getConstant(0, dl, MVT::v8i16), Src,
23556 DAG.getIntPtrConstant(0, dl));
23557
23558 SDValue Chain;
23559 if (IsStrict) {
23560 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23561 {Op.getOperand(0), Res});
23562 Chain = Res.getValue(1);
23563 } else {
23564 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23565 }
23566
23567 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23568 DAG.getIntPtrConstant(0, dl));
23569
23570 if (IsStrict)
23571 return DAG.getMergeValues({Res, Chain}, dl);
23572
23573 return Res;
23574}
23575
23576static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23577 bool IsStrict = Op->isStrictFPOpcode();
23578 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23579   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23580          "Unexpected VT!");
23581
23582 SDLoc dl(Op);
23583 SDValue Res, Chain;
23584 if (IsStrict) {
23585 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23586 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23587 DAG.getIntPtrConstant(0, dl));
23588 Res = DAG.getNode(
23589 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23590 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23591 Chain = Res.getValue(1);
23592 } else {
23593 // FIXME: Should we use zeros for upper elements for non-strict?
23594 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23595 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23596 DAG.getTargetConstant(4, dl, MVT::i32));
23597 }
23598
23599 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23600 DAG.getIntPtrConstant(0, dl));
23601
23602 if (IsStrict)
23603 return DAG.getMergeValues({Res, Chain}, dl);
23604
23605 return Res;
23606}
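
// Illustrative sketch (not part of this file): the two helpers above scalarize
// f16<->f32 conversions by placing the value in lane 0 of a vector and using
// the packed VCVTPH2PS/VCVTPS2PH instructions. With the F16C intrinsics the
// same round trip looks roughly like this; the immediate 4 used above matches
// _MM_FROUND_CUR_DIRECTION. This is an assumed example (compile with -mf16c),
// not the code the backend emits.
#include <immintrin.h>
#include <cstdint>

static float halfToFloat(uint16_t h) {
  return _cvtsh_ss(h);                              // VCVTPH2PS on lane 0
}

static uint16_t floatToHalf(float f) {
  return _cvtss_sh(f, _MM_FROUND_CUR_DIRECTION);    // VCVTPS2PH, imm = 4
}
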
23607
23608SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23609 SelectionDAG &DAG) const {
23610 SDLoc DL(Op);
23611 MakeLibCallOptions CallOptions;
23612 RTLIB::Libcall LC =
23613 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23614 SDValue Res =
23615 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23616 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23617 DAG.getBitcast(MVT::i32, Res));
23618}
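
// Illustrative sketch (not part of this file): bf16 keeps f32's 8-bit
// exponent, so a bf16 value is essentially the upper 16 bits of the matching
// f32 encoding. The lowering above delegates the rounding to a libcall and
// then re-interprets the returned 16 bits; the sketch below models the
// conversion itself with round-to-nearest-even (NaN payloads ignored) and is
// an assumption about the runtime's behaviour rather than a copy of it.
#include <cstdint>
#include <cstring>

static uint16_t floatToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t roundingBias = 0x7FFF + ((bits >> 16) & 1);  // ties to even
  return static_cast<uint16_t>((bits + roundingBias) >> 16);
}
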
23619
23620/// Depending on uarch and/or optimizing for size, we might prefer to use a
23621/// vector operation in place of the typical scalar operation.
23622static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23623 const X86Subtarget &Subtarget) {
23624 // If both operands have other uses, this is probably not profitable.
23625 SDValue LHS = Op.getOperand(0);
23626 SDValue RHS = Op.getOperand(1);
23627 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23628 return Op;
23629
23630 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23631 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23632 if (IsFP && !Subtarget.hasSSE3())
23633 return Op;
23634 if (!IsFP && !Subtarget.hasSSSE3())
23635 return Op;
23636
23637 // Extract from a common vector.
23638 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23639 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23640 LHS.getOperand(0) != RHS.getOperand(0) ||
23641 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23642 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23643 !shouldUseHorizontalOp(true, DAG, Subtarget))
23644 return Op;
23645
23646 // Allow commuted 'hadd' ops.
23647 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23648 unsigned HOpcode;
23649 switch (Op.getOpcode()) {
23650 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23651 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23652 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23653 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23654 default:
23655     llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23656 }
23657 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23658 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23659 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23660 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23661 std::swap(LExtIndex, RExtIndex);
23662
23663 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23664 return Op;
23665
23666 SDValue X = LHS.getOperand(0);
23667 EVT VecVT = X.getValueType();
23668 unsigned BitWidth = VecVT.getSizeInBits();
23669 unsigned NumLanes = BitWidth / 128;
23670 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23671   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23672          "Not expecting illegal vector widths here");
23673
23674 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23675 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23676 SDLoc DL(Op);
23677 if (BitWidth == 256 || BitWidth == 512) {
23678 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23679 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23680 LExtIndex %= NumEltsPerLane;
23681 }
23682
23683 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23684 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23685 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23686 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23687 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23688 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23689 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23690}
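
// Illustrative sketch (not part of this file): the rewrite above turns
//   add (extractelt X, 0), (extractelt X, 1)
// into "extractelt (HADD X, X), 0". With SSE3 intrinsics the resulting shape
// is roughly the following (an assumed example, not compiler output):
#include <pmmintrin.h>   // SSE3: _mm_hadd_ps

static float sumOfLanes0And1(__m128 x) {
  __m128 h = _mm_hadd_ps(x, x);   // h[0] = x[0]+x[1], h[1] = x[2]+x[3], ...
  return _mm_cvtss_f32(h);        // extract lane 0
}
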
23691
23692/// Depending on uarch and/or optimizing for size, we might prefer to use a
23693/// vector operation in place of the typical scalar operation.
23694SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23695   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23696          "Only expecting float/double");
23697 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23698}
23699
23700/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23701/// This mode isn't supported in hardware on X86. But as long as we aren't
23702/// compiling with trapping math, we can emulate this with
23703/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23704static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23705 SDValue N0 = Op.getOperand(0);
23706 SDLoc dl(Op);
23707 MVT VT = Op.getSimpleValueType();
23708
23709 // N0 += copysign(nextafter(0.5, 0.0), N0)
23710 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23711 bool Ignored;
23712 APFloat Point5Pred = APFloat(0.5f);
23713 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23714 Point5Pred.next(/*nextDown*/true);
23715
23716 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23717 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23718 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23719
23720 // Truncate the result to remove fraction.
23721 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23722}
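
// Illustrative sketch (not part of this file): a scalar model of the FROUND
// emulation above. The bias is the largest float strictly below 0.5; with a
// plain 0.5f bias, an input just below 0.5 such as 0.49999997f would round up
// to exactly 1.0f when added, and truncation would then return 1 instead of 0.
// Assumes non-trapping math, as the comment above requires.
#include <cmath>

static float roundTiesAwayFromZero(float x) {
  float bias = std::copysign(std::nextafter(0.5f, 0.0f), x);
  return std::trunc(x + bias);   // e.g. 2.5f -> 3.0f, -2.5f -> -3.0f
}
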
23723
23724/// The only differences between FABS and FNEG are the mask and the logic op.
23725/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23726static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23727   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23728          "Wrong opcode for lowering FABS or FNEG.");
23729
23730 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23731
23732 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23733 // into an FNABS. We'll lower the FABS after that if it is still in use.
23734 if (IsFABS)
23735 for (SDNode *User : Op->uses())
23736 if (User->getOpcode() == ISD::FNEG)
23737 return Op;
23738
23739 SDLoc dl(Op);
23740 MVT VT = Op.getSimpleValueType();
23741
23742 bool IsF128 = (VT == MVT::f128);
23743   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23744          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23745          "Unexpected type in LowerFABSorFNEG");
23746
23747 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
23748 // decide if we should generate a 16-byte constant mask when we only need 4 or
23749 // 8 bytes for the scalar case.
23750
23751 // There are no scalar bitwise logical SSE/AVX instructions, so we
23752 // generate a 16-byte vector constant and logic op even for the scalar case.
23753 // Using a 16-byte mask allows folding the load of the mask with
23754 // the logic op, so it can save (~4 bytes) on code size.
23755 bool IsFakeVector = !VT.isVector() && !IsF128;
23756 MVT LogicVT = VT;
23757 if (IsFakeVector)
23758 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23759 : (VT == MVT::f32) ? MVT::v4f32
23760 : MVT::v8f16;
23761
23762 unsigned EltBits = VT.getScalarSizeInBits();
23763 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
23764 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
23765 APInt::getSignMask(EltBits);
23766 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23767 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
23768
23769 SDValue Op0 = Op.getOperand(0);
23770 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
23771 unsigned LogicOp = IsFABS ? X86ISD::FAND :
23772 IsFNABS ? X86ISD::FOR :
23773 X86ISD::FXOR;
23774 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
23775
23776 if (VT.isVector() || IsF128)
23777 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23778
23779 // For the scalar case extend to a 128-bit vector, perform the logic op,
23780 // and extract the scalar result back out.
23781 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
23782 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23783 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
23784 DAG.getIntPtrConstant(0, dl));
23785}
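
// Illustrative sketch (not part of this file): the masks built above are plain
// sign-bit masks. On a scalar the same effect is
//   fabs(x): bits(x) & 0x7FFFFFFF   (APInt::getSignedMaxValue)
//   fneg(x): bits(x) ^ 0x80000000   (APInt::getSignMask)
// and the lowering simply performs this with 128-bit FAND/FXOR so the constant
// load can fold into the logic op.
#include <cstdint>
#include <cstring>

static float fabsViaMask(float x) {
  uint32_t u;
  std::memcpy(&u, &x, sizeof(u));
  u &= 0x7FFFFFFFu;                // clear the sign bit
  std::memcpy(&x, &u, sizeof(u));
  return x;
}

static float fnegViaMask(float x) {
  uint32_t u;
  std::memcpy(&u, &x, sizeof(u));
  u ^= 0x80000000u;                // flip the sign bit
  std::memcpy(&x, &u, sizeof(u));
  return x;
}
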
23786
23787static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
23788 SDValue Mag = Op.getOperand(0);
23789 SDValue Sign = Op.getOperand(1);
23790 SDLoc dl(Op);
23791
23792 // If the sign operand is smaller, extend it first.
23793 MVT VT = Op.getSimpleValueType();
23794 if (Sign.getSimpleValueType().bitsLT(VT))
23795 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
23796
23797 // And if it is bigger, shrink it first.
23798 if (Sign.getSimpleValueType().bitsGT(VT))
23799 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
23800 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
23801
23802 // At this point the operands and the result should have the same
23803 // type, and that won't be f80 since that is not custom lowered.
23804 bool IsF128 = (VT == MVT::f128);
23805   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23806          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23807          "Unexpected type in LowerFCOPYSIGN");
23808
23809 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23810
23811 // Perform all scalar logic operations as 16-byte vectors because there are no
23812 // scalar FP logic instructions in SSE.
23813 // TODO: This isn't necessary. If we used scalar types, we might avoid some
23814 // unnecessary splats, but we might miss load folding opportunities. Should
23815 // this decision be based on OptimizeForSize?
23816 bool IsFakeVector = !VT.isVector() && !IsF128;
23817 MVT LogicVT = VT;
23818 if (IsFakeVector)
23819 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23820 : (VT == MVT::f32) ? MVT::v4f32
23821 : MVT::v8f16;
23822
23823 // The mask constants are automatically splatted for vector types.
23824 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23825 SDValue SignMask = DAG.getConstantFP(
23826 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
23827 SDValue MagMask = DAG.getConstantFP(
23828 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
23829
23830 // First, clear all bits but the sign bit from the second operand (sign).
23831 if (IsFakeVector)
23832 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
23833 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
23834
23835 // Next, clear the sign bit from the first operand (magnitude).
23836 // TODO: If we had general constant folding for FP logic ops, this check
23837 // wouldn't be necessary.
23838 SDValue MagBits;
23839 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
23840 APFloat APF = Op0CN->getValueAPF();
23841 APF.clearSign();
23842 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
23843 } else {
23844 // If the magnitude operand wasn't a constant, we need to AND out the sign.
23845 if (IsFakeVector)
23846 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
23847 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
23848 }
23849
23850 // OR the magnitude value with the sign bit.
23851 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
23852 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
23853 DAG.getIntPtrConstant(0, dl));
23854}
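
// Illustrative sketch (not part of this file): the FAND/FOR sequence above is
// the classic bit-level copysign, shown here on a scalar double:
#include <cstdint>
#include <cstring>

static double copysignViaMasks(double mag, double sgn) {
  uint64_t m, s;
  std::memcpy(&m, &mag, sizeof(m));
  std::memcpy(&s, &sgn, sizeof(s));
  uint64_t r = (m & 0x7FFFFFFFFFFFFFFFull)   // MagMask: keep magnitude bits
             | (s & 0x8000000000000000ull);  // SignMask: keep only sgn's sign
  std::memcpy(&mag, &r, sizeof(r));
  return mag;
}
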
23855
23856static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
23857 SDValue N0 = Op.getOperand(0);
23858 SDLoc dl(Op);
23859 MVT VT = Op.getSimpleValueType();
23860
23861 MVT OpVT = N0.getSimpleValueType();
23862   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
23863          "Unexpected type for FGETSIGN");
23864
23865 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
23866 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
23867 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
23868 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
23869 Res = DAG.getZExtOrTrunc(Res, dl, VT);
23870 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
23871 return Res;
23872}
23873
23874/// Helper for attempting to create a X86ISD::BT node.
23875static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
23876 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
23877 // instruction. Since the shift amount is in-range-or-undefined, we know
23878 // that doing a bittest on the i32 value is ok. We extend to i32 because
23879 // the encoding for the i16 version is larger than the i32 version.
23880 // Also promote i16 to i32 for performance / code size reason.
23881 if (Src.getValueType().getScalarSizeInBits() < 32)
23882 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
23883
23884 // No legal type found, give up.
23885 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
23886 return SDValue();
23887
23888 // See if we can use the 32-bit instruction instead of the 64-bit one for a
23889 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
23890 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
23891 // known to be zero.
23892 if (Src.getValueType() == MVT::i64 &&
23893 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
23894 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
23895
23896 // If the operand types disagree, extend the shift amount to match. Since
23897 // BT ignores high bits (like shifts) we can use anyextend.
23898 if (Src.getValueType() != BitNo.getValueType()) {
23899 // Peek through a mask/modulo operation.
23900 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
23901 // we probably need a better IsDesirableToPromoteOp to handle this as well.
23902 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
23903 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
23904 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
23905 BitNo.getOperand(0)),
23906 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
23907 BitNo.getOperand(1)));
23908 else
23909 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
23910 }
23911
23912 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
23913}
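
// Illustrative sketch (not part of this file): BT places bit (BitNo mod width)
// of the source operand in CF, which is why the helper above may truncate an
// i64 source to i32 whenever bit 5 of BitNo is known zero: the modulo-32 and
// modulo-64 behaviours agree in that case. A scalar model of the 32-bit form:
#include <cstdint>

static bool bitTest32(uint32_t src, uint32_t bitNo) {
  return (src >> (bitNo & 31)) & 1u;   // same wrap-around as the 32-bit BT
}
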
23914
23915/// Helper for creating a X86ISD::SETCC node.
23916static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
23917 SelectionDAG &DAG) {
23918 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
23919 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
23920}
23921
23922/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
23923/// style scalarized (associative) reduction patterns. Partial reductions
23924/// are supported when the pointer SrcMask is non-null.
23925/// TODO - move this to SelectionDAG?
23926static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
23927 SmallVectorImpl<SDValue> &SrcOps,
23928 SmallVectorImpl<APInt> *SrcMask = nullptr) {
23929 SmallVector<SDValue, 8> Opnds;
23930 DenseMap<SDValue, APInt> SrcOpMap;
23931 EVT VT = MVT::Other;
23932
23933 // Recognize a special case where a vector is casted into wide integer to
23934 // test all 0s.
23935   assert(Op.getOpcode() == unsigned(BinOp) &&
23936          "Unexpected bit reduction opcode");
23937 Opnds.push_back(Op.getOperand(0));
23938 Opnds.push_back(Op.getOperand(1));
23939
23940 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23941 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
23942 // BFS traverse all BinOp operands.
23943 if (I->getOpcode() == unsigned(BinOp)) {
23944 Opnds.push_back(I->getOperand(0));
23945 Opnds.push_back(I->getOperand(1));
23946 // Re-evaluate the number of nodes to be traversed.
23947 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23948 continue;
23949 }
23950
23951     // Quit if this is not an EXTRACT_VECTOR_ELT.
23952 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23953 return false;
23954
23955     // Quit if the index is not a constant.
23956 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23957 if (!Idx)
23958 return false;
23959
23960 SDValue Src = I->getOperand(0);
23961 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23962 if (M == SrcOpMap.end()) {
23963 VT = Src.getValueType();
23964 // Quit if not the same type.
23965 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23966 return false;
23967 unsigned NumElts = VT.getVectorNumElements();
23968 APInt EltCount = APInt::getZero(NumElts);
23969 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23970 SrcOps.push_back(Src);
23971 }
23972
23973 // Quit if element already used.
23974 unsigned CIdx = Idx->getZExtValue();
23975 if (M->second[CIdx])
23976 return false;
23977 M->second.setBit(CIdx);
23978 }
23979
23980 if (SrcMask) {
23981 // Collect the source partial masks.
23982 for (SDValue &SrcOp : SrcOps)
23983 SrcMask->push_back(SrcOpMap[SrcOp]);
23984 } else {
23985 // Quit if not all elements are used.
23986 for (const auto &I : SrcOpMap)
23987 if (!I.second.isAllOnes())
23988 return false;
23989 }
23990
23991 return true;
23992}
23993
23994// Helper function for comparing all bits of a vector against zero.
23995static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
23996 const APInt &Mask,
23997 const X86Subtarget &Subtarget,
23998 SelectionDAG &DAG, X86::CondCode &X86CC) {
23999 EVT VT = V.getValueType();
24000 unsigned ScalarSize = VT.getScalarSizeInBits();
24001 if (Mask.getBitWidth() != ScalarSize) {
24002     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24003 return SDValue();
24004 }
24005
24006   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24007 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24008
24009 auto MaskBits = [&](SDValue Src) {
24010 if (Mask.isAllOnes())
24011 return Src;
24012 EVT SrcVT = Src.getValueType();
24013 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24014 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24015 };
24016
24017 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24018 if (VT.getSizeInBits() < 128) {
24019 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24020 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
24021 return SDValue();
24022 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24023 DAG.getBitcast(IntVT, MaskBits(V)),
24024 DAG.getConstant(0, DL, IntVT));
24025 }
24026
24027 // Quit if not splittable to 128/256-bit vector.
24028 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24029 return SDValue();
24030
24031 // Split down to 128/256-bit vector.
24032 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
24033 while (VT.getSizeInBits() > TestSize) {
24034 auto Split = DAG.SplitVector(V, DL);
24035 VT = Split.first.getValueType();
24036 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24037 }
24038
24039 bool UsePTEST = Subtarget.hasSSE41();
24040 if (UsePTEST) {
24041 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
24042 V = DAG.getBitcast(TestVT, MaskBits(V));
24043 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24044 }
24045
24046 // Without PTEST, a masked v2i64 or-reduction is not faster than
24047 // scalarization.
24048 if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
24049 return SDValue();
24050
24051 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
24052 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
24053 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
24054 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24055 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24056 DAG.getConstant(0xFFFF, DL, MVT::i32));
24057}
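
// Illustrative sketch (not part of this file): for a 128-bit vector the two
// code paths above correspond to these intrinsic patterns (assumed examples,
// not compiler output). PTEST sets ZF when V & V is all zero; the SSE2
// fallback compares every byte with zero and checks the 16-bit MOVMSK mask.
#include <emmintrin.h>   // SSE2
#include <smmintrin.h>   // SSE4.1: _mm_testz_si128 (compile with -msse4.1)

static bool isAllZeroSSE41(__m128i v) {
  return _mm_testz_si128(v, v) != 0;            // PTEST
}

static bool isAllZeroSSE2(__m128i v) {
  __m128i eq = _mm_cmpeq_epi8(v, _mm_setzero_si128());
  return _mm_movemask_epi8(eq) == 0xFFFF;       // CMP(MOVMSK(PCMPEQB(V,0)))
}
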
24058
24059// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
24060// CMP(MOVMSK(PCMPEQB(X,0))).
24061static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
24062 const SDLoc &DL,
24063 const X86Subtarget &Subtarget,
24064 SelectionDAG &DAG, SDValue &X86CC) {
24065   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24066
24067 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24068 return SDValue();
24069
24070 // Check whether we're masking/truncating an OR-reduction result, in which
24071 // case track the masked bits.
24072 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24073 switch (Op.getOpcode()) {
24074 case ISD::TRUNCATE: {
24075 SDValue Src = Op.getOperand(0);
24076 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24077 Op.getScalarValueSizeInBits());
24078 Op = Src;
24079 break;
24080 }
24081 case ISD::AND: {
24082 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24083 Mask = Cst->getAPIntValue();
24084 Op = Op.getOperand(0);
24085 }
24086 break;
24087 }
24088 }
24089
24090 SmallVector<SDValue, 8> VecIns;
24091 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
24092 EVT VT = VecIns[0].getValueType();
24093     assert(llvm::all_of(VecIns,
24094                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
24095            "Reduction source vector mismatch");
24096
24097 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
24098 if (VT.getSizeInBits() < 128 ||
24099 !llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24100 return SDValue();
24101
24102 // If more than one full vector is evaluated, OR them first before PTEST.
24103 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24104 Slot += 2, e += 1) {
24105 // Each iteration will OR 2 nodes and append the result until there is
24106 // only 1 node left, i.e. the final OR'd value of all vectors.
24107 SDValue LHS = VecIns[Slot];
24108 SDValue RHS = VecIns[Slot + 1];
24109 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
24110 }
24111
24112 X86::CondCode CCode;
24113 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
24114 DAG, CCode)) {
24115 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
24116 return V;
24117 }
24118 }
24119
24120 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24121 ISD::NodeType BinOp;
24122 if (SDValue Match =
24123 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
24124 X86::CondCode CCode;
24125 if (SDValue V =
24126 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
24127 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
24128 return V;
24129 }
24130 }
24131 }
24132
24133 return SDValue();
24134}
24135
24136/// return true if \c Op has a use that doesn't just read flags.
24137static bool hasNonFlagsUse(SDValue Op) {
24138 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24139 ++UI) {
24140 SDNode *User = *UI;
24141 unsigned UOpNo = UI.getOperandNo();
24142 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24143       // Look past the truncate.
24144 UOpNo = User->use_begin().getOperandNo();
24145 User = *User->use_begin();
24146 }
24147
24148 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24149 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24150 return true;
24151 }
24152 return false;
24153}
24154
24155// Transform to an x86-specific ALU node with flags if there is a chance of
24156// using an RMW op or only the flags are used. Otherwise, leave
24157// the node alone and emit a 'cmp' or 'test' instruction.
24158static bool isProfitableToUseFlagOp(SDValue Op) {
24159 for (SDNode *U : Op->uses())
24160 if (U->getOpcode() != ISD::CopyToReg &&
24161 U->getOpcode() != ISD::SETCC &&
24162 U->getOpcode() != ISD::STORE)
24163 return false;
24164
24165 return true;
24166}
24167
24168/// Emit nodes that will be selected as "test Op0,Op0", or something
24169/// equivalent.
24170static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24171 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24172 // CF and OF aren't always set the way we want. Determine which
24173 // of these we need.
24174 bool NeedCF = false;
24175 bool NeedOF = false;
24176 switch (X86CC) {
24177 default: break;
24178 case X86::COND_A: case X86::COND_AE:
24179 case X86::COND_B: case X86::COND_BE:
24180 NeedCF = true;
24181 break;
24182 case X86::COND_G: case X86::COND_GE:
24183 case X86::COND_L: case X86::COND_LE:
24184 case X86::COND_O: case X86::COND_NO: {
24185 // Check if we really need to set the
24186 // Overflow flag. If NoSignedWrap is present
24187 // that is not actually needed.
24188 switch (Op->getOpcode()) {
24189 case ISD::ADD:
24190 case ISD::SUB:
24191 case ISD::MUL:
24192 case ISD::SHL:
24193 if (Op.getNode()->getFlags().hasNoSignedWrap())
24194 break;
24195 [[fallthrough]];
24196 default:
24197 NeedOF = true;
24198 break;
24199 }
24200 break;
24201 }
24202 }
24203 // See if we can use the EFLAGS value from the operand instead of
24204 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24205 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24206 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24207 // Emit a CMP with 0, which is the TEST pattern.
24208 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24209 DAG.getConstant(0, dl, Op.getValueType()));
24210 }
24211 unsigned Opcode = 0;
24212 unsigned NumOperands = 0;
24213
24214 SDValue ArithOp = Op;
24215
24216 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24217 // which may be the result of a CAST. We use the variable 'Op', which is the
24218 // non-casted variable when we check for possible users.
24219 switch (ArithOp.getOpcode()) {
24220 case ISD::AND:
24221 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24222 // because a TEST instruction will be better.
24223 if (!hasNonFlagsUse(Op))
24224 break;
24225
24226 [[fallthrough]];
24227 case ISD::ADD:
24228 case ISD::SUB:
24229 case ISD::OR:
24230 case ISD::XOR:
24231 if (!isProfitableToUseFlagOp(Op))
24232 break;
24233
24234 // Otherwise use a regular EFLAGS-setting instruction.
24235 switch (ArithOp.getOpcode()) {
24236     default: llvm_unreachable("unexpected operator!");
24237 case ISD::ADD: Opcode = X86ISD::ADD; break;
24238 case ISD::SUB: Opcode = X86ISD::SUB; break;
24239 case ISD::XOR: Opcode = X86ISD::XOR; break;
24240 case ISD::AND: Opcode = X86ISD::AND; break;
24241 case ISD::OR: Opcode = X86ISD::OR; break;
24242 }
24243
24244 NumOperands = 2;
24245 break;
24246 case X86ISD::ADD:
24247 case X86ISD::SUB:
24248 case X86ISD::OR:
24249 case X86ISD::XOR:
24250 case X86ISD::AND:
24251 return SDValue(Op.getNode(), 1);
24252 case ISD::SSUBO:
24253 case ISD::USUBO: {
24254     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24255 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24256 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24257 Op->getOperand(1)).getValue(1);
24258 }
24259 default:
24260 break;
24261 }
24262
24263 if (Opcode == 0) {
24264 // Emit a CMP with 0, which is the TEST pattern.
24265 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24266 DAG.getConstant(0, dl, Op.getValueType()));
24267 }
24268 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24269 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24270
24271 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24272 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24273 return SDValue(New.getNode(), 1);
24274}
24275
24276/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24277/// equivalent.
24278static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24279 const SDLoc &dl, SelectionDAG &DAG,
24280 const X86Subtarget &Subtarget) {
24281 if (isNullConstant(Op1))
24282 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24283
24284 EVT CmpVT = Op0.getValueType();
24285
24286   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24287           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24288
24289 // Only promote the compare up to I32 if it is a 16 bit operation
24290 // with an immediate. 16 bit immediates are to be avoided.
24291 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24292 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24293 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24294 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24295 // Don't do this if the immediate can fit in 8-bits.
24296 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24297 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24298 unsigned ExtendOp =
24299 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24300 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24301 // For equality comparisons try to use SIGN_EXTEND if the input was
24302 // truncate from something with enough sign bits.
24303 if (Op0.getOpcode() == ISD::TRUNCATE) {
24304 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24305 ExtendOp = ISD::SIGN_EXTEND;
24306 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24307 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24308 ExtendOp = ISD::SIGN_EXTEND;
24309 }
24310 }
24311
24312 CmpVT = MVT::i32;
24313 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24314 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24315 }
24316 }
24317
24318 // Try to shrink i64 compares if the input has enough zero bits.
24319 // FIXME: Do this for non-constant compares for constant on LHS?
24320 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24321 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24322 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24323 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24324 CmpVT = MVT::i32;
24325 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24326 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24327 }
24328
24329 // 0-x == y --> x+y == 0
24330 // 0-x != y --> x+y != 0
24331 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24332 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24333 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24334 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24335 return Add.getValue(1);
24336 }
24337
24338 // x == 0-y --> x+y == 0
24339 // x != 0-y --> x+y != 0
24340 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24341 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24342 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24343 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24344 return Add.getValue(1);
24345 }
24346
24347 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24348 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24349 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24350 return Sub.getValue(1);
24351}
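
// Illustrative sketch (not part of this file): the two equality rewrites above
// rely on 0 - x == y being equivalent to x + y == 0 in modular arithmetic.
// For 8-bit values, for example:
#include <cstdint>
static_assert(static_cast<uint8_t>(0 - 3) == 253, "0 - 3 wraps to 253");
static_assert(static_cast<uint8_t>(3 + 253) == 0, "3 + 253 wraps to 0");
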
24352
24353/// Check if replacement of SQRT with RSQRT should be disabled.
24354bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24355 EVT VT = Op.getValueType();
24356
24357 // We don't need to replace SQRT with RSQRT for half type.
24358 if (VT.getScalarType() == MVT::f16)
24359 return true;
24360
24361 // We never want to use both SQRT and RSQRT instructions for the same input.
24362 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24363 return false;
24364
24365 if (VT.isVector())
24366 return Subtarget.hasFastVectorFSQRT();
24367 return Subtarget.hasFastScalarFSQRT();
24368}
24369
24370/// The minimum architected relative accuracy is 2^-12. We need one
24371/// Newton-Raphson step to have a good float result (24 bits of precision).
24372SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24373 SelectionDAG &DAG, int Enabled,
24374 int &RefinementSteps,
24375 bool &UseOneConstNR,
24376 bool Reciprocal) const {
24377 SDLoc DL(Op);
24378 EVT VT = Op.getValueType();
24379
24380 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24381 // It is likely not profitable to do this for f64 because a double-precision
24382 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24383 // instructions: convert to single, rsqrtss, convert back to double, refine
24384 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24385 // along with FMA, this could be a throughput win.
24386 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24387 // after legalize types.
24388 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24389 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24390 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24391 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24392 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24393 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24394 RefinementSteps = 1;
24395
24396 UseOneConstNR = false;
24397 // There is no FSQRT for 512-bits, but there is RSQRT14.
24398 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24399 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24400 if (RefinementSteps == 0 && !Reciprocal)
24401 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24402 return Estimate;
24403 }
24404
24405 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24406 Subtarget.hasFP16()) {
24407     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24408 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24409 RefinementSteps = 0;
24410
24411 if (VT == MVT::f16) {
24412 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24413 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24414 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24415 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24416 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24417 }
24418
24419 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24420 }
24421 return SDValue();
24422}
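
// Illustrative sketch (not part of this file): RSQRTPS delivers roughly 12
// bits of precision, and the single Newton-Raphson step mentioned above,
//   x1 = x0 * (1.5 - 0.5 * a * x0 * x0),
// recovers a usable float estimate of 1/sqrt(a). Only the refinement formula
// is shown; the exact DAG the backend builds may be scheduled differently.
#include <xmmintrin.h>   // SSE1

static __m128 rsqrtRefined(__m128 a) {
  __m128 x0 = _mm_rsqrt_ps(a);                       // ~12-bit estimate
  __m128 halfA = _mm_mul_ps(_mm_set1_ps(0.5f), a);
  __m128 nr = _mm_sub_ps(_mm_set1_ps(1.5f),
                         _mm_mul_ps(halfA, _mm_mul_ps(x0, x0)));
  return _mm_mul_ps(x0, nr);                         // one refinement step
}
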
24423
24424/// The minimum architected relative accuracy is 2^-12. We need one
24425/// Newton-Raphson step to have a good float result (24 bits of precision).
24426SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
24427 int Enabled,
24428 int &RefinementSteps) const {
24429 SDLoc DL(Op);
24430 EVT VT = Op.getValueType();
24431
24432 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
24433 // It is likely not profitable to do this for f64 because a double-precision
24434 // reciprocal estimate with refinement on x86 prior to FMA requires
24435 // 15 instructions: convert to single, rcpss, convert back to double, refine
24436 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
24437 // along with FMA, this could be a throughput win.
24438
24439 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24440 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
24441 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24442 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24443 // Enable estimate codegen with 1 refinement step for vector division.
24444 // Scalar division estimates are disabled because they break too much
24445 // real-world code. These defaults are intended to match GCC behavior.
24446 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
24447 return SDValue();
24448
24449 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24450 RefinementSteps = 1;
24451
24452 // There is no FSQRT for 512-bits, but there is RCP14.
24453 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
24454 return DAG.getNode(Opcode, DL, VT, Op);
24455 }
24456
24457 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24458 Subtarget.hasFP16()) {
24459 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24460 RefinementSteps = 0;
24461
24462 if (VT == MVT::f16) {
24463 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24464 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24465 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24466 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
24467 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24468 }
24469
24470 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
24471 }
24472 return SDValue();
24473}
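
// Illustrative sketch (not part of this file): the reciprocal analogue uses
// RCPPS and the Newton-Raphson step x1 = x0 * (2 - a * x0). Again only the
// refinement formula is shown, as an assumption about the shape of the result
// rather than the exact instruction sequence.
#include <xmmintrin.h>   // SSE1

static __m128 recipRefined(__m128 a) {
  __m128 x0 = _mm_rcp_ps(a);                         // ~12-bit estimate
  return _mm_mul_ps(x0, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x0)));
}
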
24474
24475/// If we have at least two divisions that use the same divisor, convert to
24476/// multiplication by a reciprocal. This may need to be adjusted for a given
24477/// CPU if a division's cost is not at least twice the cost of a multiplication.
24478/// This is because we still need one division to calculate the reciprocal and
24479/// then we need two multiplies by that reciprocal as replacements for the
24480/// original divisions.
24481unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
24482 return 2;
24483}
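
// Illustrative sketch (not part of this file): returning 2 means the combine
// fires once at least two divisions share a divisor, trading them for one
// division plus one multiply per use (given the required fast-math flags):
static void scaleBoth(float a, float b, float d, float &x, float &y) {
  float r = 1.0f / d;   // the single remaining division
  x = a * r;
  y = b * r;
}
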
24484
24485SDValue
24486X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
24487 SelectionDAG &DAG,
24488 SmallVectorImpl<SDNode *> &Created) const {
24489 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
24490 if (isIntDivCheap(N->getValueType(0), Attr))
24491 return SDValue(N,0); // Lower SDIV as SDIV
24492
24493   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
24494          "Unexpected divisor!");
24495
24496 // Only perform this transform if CMOV is supported otherwise the select
24497 // below will become a branch.
24498 if (!Subtarget.canUseCMOV())
24499 return SDValue();
24500
24501 // fold (sdiv X, pow2)
24502 EVT VT = N->getValueType(0);
24503 // FIXME: Support i8.
24504 if (VT != MVT::i16 && VT != MVT::i32 &&
24505 !(Subtarget.is64Bit() && VT == MVT::i64))
24506 return SDValue();
24507
24508 unsigned Lg2 = Divisor.countr_zero();
24509
24510 // If the divisor is 2 or -2, the default expansion is better.
24511 if (Lg2 == 1)
24512 return SDValue();
24513
24514 SDLoc DL(N);
24515 SDValue N0 = N->getOperand(0);
24516 SDValue Zero = DAG.getConstant(0, DL, VT);
24517 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
24518 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
24519
24520 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
24521 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
24522 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
24523 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
24524
24525 Created.push_back(Cmp.getNode());
24526 Created.push_back(Add.getNode());
24527 Created.push_back(CMov.getNode());
24528
24529 // Divide by pow2.
24530 SDValue SRA =
24531 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
24532
24533 // If we're dividing by a positive value, we're done. Otherwise, we must
24534 // negate the result.
24535 if (Divisor.isNonNegative())
24536 return SRA;
24537
24538 Created.push_back(SRA.getNode());
24539 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
24540}
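
// Illustrative sketch (not part of this file): a scalar model of the sequence
// built above for sdiv by 2^k with k > 1. A negative dividend is biased by
// 2^k - 1 before the arithmetic shift so the quotient rounds toward zero, and
// a negated power-of-two divisor just negates the result afterwards.
#include <cstdint>

static int32_t sdivByPow2(int32_t x, unsigned k) {    // divides by +2^k, k < 31
  int32_t biased = x < 0 ? x + ((1 << k) - 1) : x;    // SETCC + ADD + SELECT/CMOV
  return biased >> k;                                 // SRA (arithmetic shift)
}
// For example, sdivByPow2(-5, 2) == -1 and sdivByPow2(-8, 2) == -2,
// matching -5/4 and -8/4 with rounding toward zero.
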
24541
24542/// Result of 'and' is compared against zero. Change to a BT node if possible.
24543/// Returns the BT node and the condition code needed to use it.
24544static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
24545 SelectionDAG &DAG, X86::CondCode &X86CC) {
24546   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
24547 SDValue Op0 = And.getOperand(0);
24548 SDValue Op1 = And.getOperand(1);
24549 if (Op0.getOpcode() == ISD::TRUNCATE)
24550 Op0 = Op0.getOperand(0);
24551 if (Op1.getOpcode() == ISD::TRUNCATE)
24552 Op1 = Op1.getOperand(0);
24553
24554 SDValue Src, BitNo;
24555 if (Op1.getOpcode() == ISD::SHL)
24556 std::swap(Op0, Op1);
24557 if (Op0.getOpcode() == ISD::SHL) {
24558 if (isOneConstant(Op0.getOperand(0))) {
24559 // If we looked past a truncate, check that it's only truncating away
24560 // known zeros.
24561 unsigned BitWidth = Op0.getValueSizeInBits();
24562 unsigned AndBitWidth = And.getValueSizeInBits();
24563 if (BitWidth > AndBitWidth) {
24564 KnownBits Known = DAG.computeKnownBits(Op0);
24565 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
24566 return SDValue();
24567 }
24568 Src = Op1;
24569 BitNo = Op0.getOperand(1);
24570 }
24571 } else if (Op1.getOpcode() == ISD::Constant) {
24572 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
24573 uint64_t AndRHSVal = AndRHS->getZExtValue();
24574 SDValue AndLHS = Op0;
24575
24576 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
24577 Src = AndLHS.getOperand(0);
24578 BitNo = AndLHS.getOperand(1);
24579 } else {
24580 // Use BT if the immediate can't be encoded in a TEST instruction or we
24581       // are optimizing for size and the immediate won't fit in a byte.
24582 bool OptForSize = DAG.shouldOptForSize();
24583 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
24584 isPowerOf2_64(AndRHSVal)) {
24585 Src = AndLHS;
24586 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
24587 Src.getValueType());
24588 }
24589 }
24590 }
24591
24592 // No patterns found, give up.
24593 if (!Src.getNode())
24594 return SDValue();
24595
24596 // Remove any bit flip.
24597 if (isBitwiseNot(Src)) {
24598 Src = Src.getOperand(0);
24599 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
24600 }
24601
24602 // Attempt to create the X86ISD::BT node.
24603 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
24604 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24605 return BT;
24606 }
24607
24608 return SDValue();
24609}
24610
24611// Check if pre-AVX condcode can be performed by a single FCMP op.
24612static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
24613 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
24614}
24615
24616/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
24617/// CMPs.
24618static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
24619 SDValue &Op1, bool &IsAlwaysSignaling) {
24620 unsigned SSECC;
24621 bool Swap = false;
24622
24623 // SSE Condition code mapping:
24624 // 0 - EQ
24625 // 1 - LT
24626 // 2 - LE
24627 // 3 - UNORD
24628 // 4 - NEQ
24629 // 5 - NLT
24630 // 6 - NLE
24631 // 7 - ORD
24632 switch (SetCCOpcode) {
24633   default: llvm_unreachable("Unexpected SETCC condition");
24634 case ISD::SETOEQ:
24635 case ISD::SETEQ: SSECC = 0; break;
24636 case ISD::SETOGT:
24637 case ISD::SETGT: Swap = true; [[fallthrough]];
24638 case ISD::SETLT:
24639 case ISD::SETOLT: SSECC = 1; break;
24640 case ISD::SETOGE:
24641 case ISD::SETGE: Swap = true; [[fallthrough]];
24642 case ISD::SETLE:
24643 case ISD::SETOLE: SSECC = 2; break;
24644 case ISD::SETUO: SSECC = 3; break;
24645 case ISD::SETUNE:
24646 case ISD::SETNE: SSECC = 4; break;
24647 case ISD::SETULE: Swap = true; [[fallthrough]];
24648 case ISD::SETUGE: SSECC = 5; break;
24649 case ISD::SETULT: Swap = true; [[fallthrough]];
24650 case ISD::SETUGT: SSECC = 6; break;
24651 case ISD::SETO: SSECC = 7; break;
24652 case ISD::SETUEQ: SSECC = 8; break;
24653 case ISD::SETONE: SSECC = 12; break;
24654 }
24655 if (Swap)
24656 std::swap(Op0, Op1);
24657
24658 switch (SetCCOpcode) {
24659 default:
24660 IsAlwaysSignaling = true;
24661 break;
24662 case ISD::SETEQ:
24663 case ISD::SETOEQ:
24664 case ISD::SETUEQ:
24665 case ISD::SETNE:
24666 case ISD::SETONE:
24667 case ISD::SETUNE:
24668 case ISD::SETO:
24669 case ISD::SETUO:
24670 IsAlwaysSignaling = false;
24671 break;
24672 }
24673
24674 return SSECC;
24675}
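// A hedged illustration of the mapping above: SETOGT swaps the operands and
// uses SSECC = 1 (LT), so "a > b" is emitted as "b < a"; the quiet SETUEQ and
// SETONE predicates get the AVX-only encodings 8 and 12, which is why
// cheapX86FSETCC_SSE reports them as needing two compares on pre-AVX targets.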
24676
24677/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
24678/// concatenate the result back.
24679static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
24680 ISD::CondCode Cond, SelectionDAG &DAG,
24681 const SDLoc &dl) {
24682  assert(VT.isInteger() && VT == LHS.getValueType() &&
24683         VT == RHS.getValueType() && "Unsupported VTs!");
24684
24685 SDValue CC = DAG.getCondCode(Cond);
24686
24687 // Extract the LHS Lo/Hi vectors
24688 SDValue LHS1, LHS2;
24689 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
24690
24691 // Extract the RHS Lo/Hi vectors
24692 SDValue RHS1, RHS2;
24693 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
24694
24695 // Issue the operation on the smaller types and concatenate the result back
24696 EVT LoVT, HiVT;
24697 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
24698 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
24699 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
24700 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
24701}
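// Hedged example: a v8i32 SETCC on a target without AVX2 is split here into
// two v4i32 SETCCs on the Lo/Hi halves and glued back together with
// CONCAT_VECTORS, per the splitVector/GetSplitDestVTs calls above.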
24702
24703static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
24704
24705 SDValue Op0 = Op.getOperand(0);
24706 SDValue Op1 = Op.getOperand(1);
24707 SDValue CC = Op.getOperand(2);
24708 MVT VT = Op.getSimpleValueType();
24709 SDLoc dl(Op);
24710
24711  assert(VT.getVectorElementType() == MVT::i1 &&
24712         "Cannot set masked compare for this operation");
24713
24714 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
24715
24716 // Prefer SETGT over SETLT.
24717 if (SetCCOpcode == ISD::SETLT) {
24718 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
24719 std::swap(Op0, Op1);
24720 }
24721
24722 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
24723}
24724
24725/// Given a buildvector constant, return a new vector constant with each element
24726/// incremented or decremented. If incrementing or decrementing would result in
24727/// unsigned overflow or underflow or this is not a simple vector constant,
24728/// return an empty value.
24729static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
24730 bool NSW) {
24731 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
24732 if (!BV || !V.getValueType().isSimple())
24733 return SDValue();
24734
24735 MVT VT = V.getSimpleValueType();
24736 MVT EltVT = VT.getVectorElementType();
24737 unsigned NumElts = VT.getVectorNumElements();
24738 SmallVector<SDValue, 8> NewVecC;
24739 SDLoc DL(V);
24740 for (unsigned i = 0; i < NumElts; ++i) {
24741 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
24742 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
24743 return SDValue();
24744
24745 // Avoid overflow/underflow.
24746 const APInt &EltC = Elt->getAPIntValue();
24747 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
24748 return SDValue();
24749 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
24750 (!IsInc && EltC.isMinSignedValue())))
24751 return SDValue();
24752
24753 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
24754 }
24755
24756 return DAG.getBuildVector(VT, DL, NewVecC);
24757}
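// Rough example (made-up values): V = <4 x i32> <7, 7, 7, 7> with IsInc=true
// yields <8, 8, 8, 8>; the helper instead returns an empty SDValue if any lane
// is already at the unsigned limit (or the signed limit when NSW is set), so
// callers treat the rewrite as not applicable.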
24758
24759/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
24760/// Op0 u<= Op1:
24761/// t = psubus Op0, Op1
24762/// pcmpeq t, <0..0>
24763static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
24764 ISD::CondCode Cond, const SDLoc &dl,
24765 const X86Subtarget &Subtarget,
24766 SelectionDAG &DAG) {
24767 if (!Subtarget.hasSSE2())
24768 return SDValue();
24769
24770 MVT VET = VT.getVectorElementType();
24771 if (VET != MVT::i8 && VET != MVT::i16)
24772 return SDValue();
24773
24774 switch (Cond) {
24775 default:
24776 return SDValue();
24777 case ISD::SETULT: {
24778 // If the comparison is against a constant we can turn this into a
24779 // setule. With psubus, setule does not require a swap. This is
24780 // beneficial because the constant in the register is no longer
24781 // destructed as the destination so it can be hoisted out of a loop.
24782 // Only do this pre-AVX since vpcmp* is no longer destructive.
24783 if (Subtarget.hasAVX())
24784 return SDValue();
24785 SDValue ULEOp1 =
24786 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24787 if (!ULEOp1)
24788 return SDValue();
24789 Op1 = ULEOp1;
24790 break;
24791 }
24792 case ISD::SETUGT: {
24793 // If the comparison is against a constant, we can turn this into a setuge.
24794 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24795 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24796 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24797 SDValue UGEOp1 =
24798 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24799 if (!UGEOp1)
24800 return SDValue();
24801 Op1 = Op0;
24802 Op0 = UGEOp1;
24803 break;
24804 }
24805 // Psubus is better than flip-sign because it requires no inversion.
24806 case ISD::SETUGE:
24807 std::swap(Op0, Op1);
24808 break;
24809 case ISD::SETULE:
24810 break;
24811 }
24812
24813 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24814 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24815 DAG.getConstant(0, dl, VT));
24816}
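// Sketch of the rewrites above (illustrative only), for i8/i16 element types:
//   x u<= y  -->  pcmpeq (usubsat x, y), 0
//   x u<  C  -->  x u<= (C-1)                      (pre-AVX, constant splat C)
//   x u>  C  -->  (C+1) u<= x  -->  pcmpeq (usubsat (C+1), x), 0
// where incDecVectorConstant provides the adjusted constant or bails out.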
24817
24818static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24819 SelectionDAG &DAG) {
24820 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24821 Op.getOpcode() == ISD::STRICT_FSETCCS;
24822 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24823 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24824 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24825 MVT VT = Op->getSimpleValueType(0);
24826 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24827 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
24828 SDLoc dl(Op);
24829
24830 if (isFP) {
24831 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
24832    assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
24833 if (isSoftFP16(EltVT, Subtarget))
24834 return SDValue();
24835
24836 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24837 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24838
24839 // If we have a strict compare with a vXi1 result and the input is 128/256
24840 // bits we can't use a masked compare unless we have VLX. If we use a wider
24841 // compare like we do for non-strict, we might trigger spurious exceptions
24842    // from the upper elements. Instead emit an AVX compare and convert to mask.
24843 unsigned Opc;
24844 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24845 (!IsStrict || Subtarget.hasVLX() ||
24846 Op0.getSimpleValueType().is512BitVector())) {
24847#ifndef NDEBUG
24848 unsigned Num = VT.getVectorNumElements();
24849      assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
24850#endif
24851 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24852 } else {
24853 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24854 // The SSE/AVX packed FP comparison nodes are defined with a
24855 // floating-point vector result that matches the operand type. This allows
24856 // them to work with an SSE1 target (integer vector types are not legal).
24857 VT = Op0.getSimpleValueType();
24858 }
24859
24860 SDValue Cmp;
24861 bool IsAlwaysSignaling;
24862 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24863 if (!Subtarget.hasAVX()) {
24864      // TODO: We could use the following steps to handle a quiet compare with
24865 // signaling encodings.
24866 // 1. Get ordered masks from a quiet ISD::SETO
24867 // 2. Use the masks to mask potential unordered elements in operand A, B
24868 // 3. Get the compare results of masked A, B
24869      // 4. Calculate the final result using the mask and the result from 3
24870 // But currently, we just fall back to scalar operations.
24871 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24872 return SDValue();
24873
24874      // Insert an extra signaling instruction to raise an exception.
24875 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24876 SDValue SignalCmp = DAG.getNode(
24877 Opc, dl, {VT, MVT::Other},
24878 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24879 // FIXME: It seems we need to update the flags of all new strict nodes.
24880 // Otherwise, mayRaiseFPException in MI will return false due to
24881 // NoFPExcept = false by default. However, I didn't find it in other
24882 // patches.
24883 SignalCmp->setFlags(Op->getFlags());
24884 Chain = SignalCmp.getValue(1);
24885 }
24886
24887 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24888 // emit two comparisons and a logic op to tie them together.
24889 if (!cheapX86FSETCC_SSE(Cond)) {
24890 // LLVM predicate is SETUEQ or SETONE.
24891 unsigned CC0, CC1;
24892 unsigned CombineOpc;
24893 if (Cond == ISD::SETUEQ) {
24894 CC0 = 3; // UNORD
24895 CC1 = 0; // EQ
24896 CombineOpc = X86ISD::FOR;
24897 } else {
24898          assert(Cond == ISD::SETONE);
24899 CC0 = 7; // ORD
24900 CC1 = 4; // NEQ
24901 CombineOpc = X86ISD::FAND;
24902 }
24903
24904 SDValue Cmp0, Cmp1;
24905 if (IsStrict) {
24906 Cmp0 = DAG.getNode(
24907 Opc, dl, {VT, MVT::Other},
24908 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24909 Cmp1 = DAG.getNode(
24910 Opc, dl, {VT, MVT::Other},
24911 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24912 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24913 Cmp1.getValue(1));
24914 } else {
24915 Cmp0 = DAG.getNode(
24916 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24917 Cmp1 = DAG.getNode(
24918 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24919 }
24920 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24921 } else {
24922 if (IsStrict) {
24923 Cmp = DAG.getNode(
24924 Opc, dl, {VT, MVT::Other},
24925 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24926 Chain = Cmp.getValue(1);
24927 } else
24928 Cmp = DAG.getNode(
24929 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24930 }
24931 } else {
24932 // Handle all other FP comparisons here.
24933 if (IsStrict) {
24934 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24935 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24936 Cmp = DAG.getNode(
24937 Opc, dl, {VT, MVT::Other},
24938 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24939 Chain = Cmp.getValue(1);
24940 } else
24941 Cmp = DAG.getNode(
24942 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24943 }
24944
24945 if (VT.getFixedSizeInBits() >
24946 Op.getSimpleValueType().getFixedSizeInBits()) {
24947 // We emitted a compare with an XMM/YMM result. Finish converting to a
24948 // mask register using a vptestm.
24949 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24950 Cmp = DAG.getBitcast(CastVT, Cmp);
24951 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24952 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24953 } else {
24954 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24955 // the result type of SETCC. The bitcast is expected to be optimized
24956 // away during combining/isel.
24957 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24958 }
24959
24960 if (IsStrict)
24961 return DAG.getMergeValues({Cmp, Chain}, dl);
24962
24963 return Cmp;
24964 }
24965
24966  assert(!IsStrict && "Strict SETCC only handles FP operands.");
24967
24968 MVT VTOp0 = Op0.getSimpleValueType();
24969 (void)VTOp0;
24970  assert(VTOp0 == Op1.getSimpleValueType() &&
24971         "Expected operands with same type!");
24972  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24973         "Invalid number of packed elements for source and destination!");
24974
24975 // The non-AVX512 code below works under the assumption that source and
24976 // destination types are the same.
24977  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24978         "Value types for source and destination must be the same!");
24979
24980 // The result is boolean, but operands are int/float
24981 if (VT.getVectorElementType() == MVT::i1) {
24982    // In the AVX-512 architecture setcc returns a mask with i1 elements,
24983    // but there is no compare instruction for i8 and i16 elements in KNL.
24984    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24985           "Unexpected operand type");
24986 return LowerIntVSETCC_AVX512(Op, DAG);
24987 }
24988
24989 // Lower using XOP integer comparisons.
24990 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24991 // Translate compare code to XOP PCOM compare mode.
24992 unsigned CmpMode = 0;
24993 switch (Cond) {
24994    default: llvm_unreachable("Unexpected SETCC condition");
24995 case ISD::SETULT:
24996 case ISD::SETLT: CmpMode = 0x00; break;
24997 case ISD::SETULE:
24998 case ISD::SETLE: CmpMode = 0x01; break;
24999 case ISD::SETUGT:
25000 case ISD::SETGT: CmpMode = 0x02; break;
25001 case ISD::SETUGE:
25002 case ISD::SETGE: CmpMode = 0x03; break;
25003 case ISD::SETEQ: CmpMode = 0x04; break;
25004 case ISD::SETNE: CmpMode = 0x05; break;
25005 }
25006
25007 // Are we comparing unsigned or signed integers?
25008 unsigned Opc =
25009 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25010
25011 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25012 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25013 }
25014
25015 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25016 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25017 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25018 SDValue BC0 = peekThroughBitcasts(Op0);
25019 if (BC0.getOpcode() == ISD::AND) {
25020 APInt UndefElts;
25021 SmallVector<APInt, 64> EltBits;
25022 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25023 VT.getScalarSizeInBits(), UndefElts,
25024 EltBits, false, false)) {
25025 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25026 Cond = ISD::SETEQ;
25027 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25028 }
25029 }
25030 }
25031 }
25032
25033 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25034 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25035 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25036 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25037 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25038 unsigned BitWidth = VT.getScalarSizeInBits();
25039 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25040
25041 SDValue Result = Op0.getOperand(0);
25042 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25043 DAG.getConstant(ShiftAmt, dl, VT));
25044 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25045 DAG.getConstant(BitWidth - 1, dl, VT));
25046 return Result;
25047 }
25048 }
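// Worked instance (hypothetical values): v4i32, C = 8 (bit 3), so
// ShiftAmt = 32 - 3 - 1 = 28 and (x & 8) == 8 lowers to sra (shl x, 28), 31,
// which broadcasts bit 3 of each lane into an all-ones/all-zeros mask.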
25049
25050 // Break 256-bit integer vector compare into smaller ones.
25051 if (VT.is256BitVector() && !Subtarget.hasInt256())
25052 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25053
25054 // Break 512-bit integer vector compare into smaller ones.
25055 // TODO: Try harder to use VPCMPx + VPMOV2x?
25056 if (VT.is512BitVector())
25057 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25058
25059 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25060 // not-of-PCMPEQ:
25061 // X != INT_MIN --> X >s INT_MIN
25062 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25063 // +X != 0 --> +X >s 0
25064 APInt ConstValue;
25065 if (Cond == ISD::SETNE &&
25066 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25067 if (ConstValue.isMinSignedValue())
25068 Cond = ISD::SETGT;
25069 else if (ConstValue.isMaxSignedValue())
25070 Cond = ISD::SETLT;
25071 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25072 Cond = ISD::SETGT;
25073 }
25074
25075 // If both operands are known non-negative, then an unsigned compare is the
25076 // same as a signed compare and there's no need to flip signbits.
25077 // TODO: We could check for more general simplifications here since we're
25078 // computing known bits.
25079 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25080 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25081
25082 // Special case: Use min/max operations for unsigned compares.
25083 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25084 if (ISD::isUnsignedIntSetCC(Cond) &&
25085 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25086 TLI.isOperationLegal(ISD::UMIN, VT)) {
25087 // If we have a constant operand, increment/decrement it and change the
25088 // condition to avoid an invert.
25089 if (Cond == ISD::SETUGT) {
25090 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25091 if (SDValue UGTOp1 =
25092 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25093 Op1 = UGTOp1;
25094 Cond = ISD::SETUGE;
25095 }
25096 }
25097 if (Cond == ISD::SETULT) {
25098 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25099 if (SDValue ULTOp1 =
25100 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25101 Op1 = ULTOp1;
25102 Cond = ISD::SETULE;
25103 }
25104 }
25105 bool Invert = false;
25106 unsigned Opc;
25107 switch (Cond) {
25108    default: llvm_unreachable("Unexpected condition code");
25109 case ISD::SETUGT: Invert = true; [[fallthrough]];
25110 case ISD::SETULE: Opc = ISD::UMIN; break;
25111 case ISD::SETULT: Invert = true; [[fallthrough]];
25112 case ISD::SETUGE: Opc = ISD::UMAX; break;
25113 }
25114
25115 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25116 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25117
25118 // If the logical-not of the result is required, perform that now.
25119 if (Invert)
25120 Result = DAG.getNOT(dl, Result, VT);
25121
25122 return Result;
25123 }
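// Example of the min/max form (a sketch assuming v8i16 and a splat constant):
//   x u>  7  -->  x u>= 8  -->  pcmpeq (x, umax (x, 8))
//   x u<= 7  -->              pcmpeq (x, umin (x, 7))
// and when the constant can't be adjusted, SETUGT/SETULT keep the same
// min/max compare but invert the PCMPEQ result via getNOT.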
25124
25125 // Try to use SUBUS and PCMPEQ.
25126 if (FlipSigns)
25127 if (SDValue V =
25128 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25129 return V;
25130
25131 // We are handling one of the integer comparisons here. Since SSE only has
25132  // GT and EQ comparisons for integers, swapping operands and multiple
25133 // operations may be required for some comparisons.
25134 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25135 : X86ISD::PCMPGT;
25136 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25137 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25138 bool Invert = Cond == ISD::SETNE ||
25139 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25140
25141 if (Swap)
25142 std::swap(Op0, Op1);
25143
25144 // Check that the operation in question is available (most are plain SSE2,
25145 // but PCMPGTQ and PCMPEQQ have different requirements).
25146 if (VT == MVT::v2i64) {
25147 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25148      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25149
25150 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25151 // the odd elements over the even elements.
25152 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25153 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25154 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25155
25156 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25157 static const int MaskHi[] = { 1, 1, 3, 3 };
25158 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25159
25160 return DAG.getBitcast(VT, Result);
25161 }
25162
25163 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25164 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25165 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25166
25167 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25168 static const int MaskHi[] = { 1, 1, 3, 3 };
25169 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25170
25171 return DAG.getBitcast(VT, Result);
25172 }
25173
25174 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25175 // bits of the inputs before performing those operations. The lower
25176 // compare is always unsigned.
25177 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25178 : 0x0000000080000000ULL,
25179 dl, MVT::v2i64);
25180
25181 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25182 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25183
25184 // Cast everything to the right type.
25185 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25186 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25187
25188 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25189 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25190 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25191
25192      // Create masks for only the low parts/high parts of the 64-bit integers.
25193 static const int MaskHi[] = { 1, 1, 3, 3 };
25194 static const int MaskLo[] = { 0, 0, 2, 2 };
25195 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25196 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25197 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25198
25199 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25200 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25201
25202 if (Invert)
25203 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25204
25205 return DAG.getBitcast(VT, Result);
25206 }
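// Sketch of the emulation above, treating each i64 lane as hi:lo dwords:
//   a >s b  ==  (a.hi >s b.hi) | ((a.hi == b.hi) & (a.lo >u b.lo))
// The XOR with 0x0000000080000000 biases only the low dwords so the signed
// PCMPGT acts as an unsigned compare there; 0x8000000080000000 also flips the
// high dwords when the whole i64 compare is unsigned (FlipSigns). The
// MaskHi/MaskLo shuffles then replicate the per-half results across each lane.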
25207
25208 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25209 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25210 // pcmpeqd + pshufd + pand.
25211      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25212
25213 // First cast everything to the right type.
25214 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25215 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25216
25217 // Do the compare.
25218 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25219
25220 // Make sure the lower and upper halves are both all-ones.
25221 static const int Mask[] = { 1, 0, 3, 2 };
25222 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25223 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25224
25225 if (Invert)
25226 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25227
25228 return DAG.getBitcast(VT, Result);
25229 }
25230 }
25231
25232 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25233 // bits of the inputs before performing those operations.
25234 if (FlipSigns) {
25235 MVT EltVT = VT.getVectorElementType();
25236 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25237 VT);
25238 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25239 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25240 }
25241
25242 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25243
25244 // If the logical-not of the result is required, perform that now.
25245 if (Invert)
25246 Result = DAG.getNOT(dl, Result, VT);
25247
25248 return Result;
25249}
25250
25251// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25252static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25253 const SDLoc &dl, SelectionDAG &DAG,
25254 const X86Subtarget &Subtarget,
25255 SDValue &X86CC) {
25256 // Only support equality comparisons.
25257 if (CC != ISD::SETEQ && CC != ISD::SETNE)
25258 return SDValue();
25259
25260 // Must be a bitcast from vXi1.
25261 if (Op0.getOpcode() != ISD::BITCAST)
25262 return SDValue();
25263
25264 Op0 = Op0.getOperand(0);
25265 MVT VT = Op0.getSimpleValueType();
25266 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25267 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25268 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25269 return SDValue();
25270
25271 X86::CondCode X86Cond;
25272 if (isNullConstant(Op1)) {
25273 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25274 } else if (isAllOnesConstant(Op1)) {
25275 // C flag is set for all ones.
25276 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25277 } else
25278 return SDValue();
25279
25280  // If the input is an AND, we can combine its operands into the KTEST.
25281 bool KTestable = false;
25282 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25283 KTestable = true;
25284 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25285 KTestable = true;
25286 if (!isNullConstant(Op1))
25287 KTestable = false;
25288 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25289 SDValue LHS = Op0.getOperand(0);
25290 SDValue RHS = Op0.getOperand(1);
25291 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25292 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25293 }
25294
25295  // If the input is an OR, we can combine its operands into the KORTEST.
25296 SDValue LHS = Op0;
25297 SDValue RHS = Op0;
25298 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25299 LHS = Op0.getOperand(0);
25300 RHS = Op0.getOperand(1);
25301 }
25302
25303 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25304 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25305}
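// Illustration (hedged): (setcc (bitcast v16i1 K), 0, seteq) becomes
// KORTEST K, K with X86::COND_E; comparing against all-ones instead uses the
// carry flag (COND_B/COND_AE) since KORTEST sets CF when the mask is all ones.
// An OR feeding the bitcast passes its operands straight to KORTEST, and an
// AND can feed KTEST, but only for the compare-with-zero case.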
25306
25307/// Emit flags for the given setcc condition and operands. Also returns the
25308/// corresponding X86 condition code constant in X86CC.
25309SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25310 ISD::CondCode CC, const SDLoc &dl,
25311 SelectionDAG &DAG,
25312 SDValue &X86CC) const {
25313 // Optimize to BT if possible.
25314 // Lower (X & (1 << N)) == 0 to BT(X, N).
25315 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25316 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25317 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
25318 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25319 X86::CondCode X86CondCode;
25320 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25321 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25322 return BT;
25323 }
25324 }
25325
25326  // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
25327 // TODO: We could do AND tree with all 1s as well by using the C flag.
25328 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
25329 if (SDValue CmpZ =
25330 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
25331 return CmpZ;
25332
25333 // Try to lower using KORTEST or KTEST.
25334 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25335 return Test;
25336
25337 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
25338 // these.
25339 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
25340 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25341 // If the input is a setcc, then reuse the input setcc or use a new one with
25342 // the inverted condition.
25343 if (Op0.getOpcode() == X86ISD::SETCC) {
25344 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25345
25346 X86CC = Op0.getOperand(0);
25347 if (Invert) {
25348 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25349 CCode = X86::GetOppositeBranchCondition(CCode);
25350 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
25351 }
25352
25353 return Op0.getOperand(1);
25354 }
25355 }
25356
25357  // Try to use the carry flag from the add in place of a separate CMP for:
25358 // (seteq (add X, -1), -1). Similar for setne.
25359 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25360 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25361 if (isProfitableToUseFlagOp(Op0)) {
25362 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25363
25364 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25365 Op0.getOperand(1));
25366 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25367 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25368 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
25369 return SDValue(New.getNode(), 1);
25370 }
25371 }
25372
25373 X86::CondCode CondCode =
25374 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25375  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25376
25377 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25378 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25379 return EFLAGS;
25380}
25381
25382SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25383
25384 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25385 Op.getOpcode() == ISD::STRICT_FSETCCS;
25386 MVT VT = Op->getSimpleValueType(0);
25387
25388 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25389
25390  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25391 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25392 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25393 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25394 SDLoc dl(Op);
25395 ISD::CondCode CC =
25396 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25397
25398 if (isSoftFP16(Op0.getValueType()))
25399 return SDValue();
25400
25401 // Handle f128 first, since one possible outcome is a normal integer
25402 // comparison which gets handled by emitFlagsForSetcc.
25403 if (Op0.getValueType() == MVT::f128) {
25404 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25405 Op.getOpcode() == ISD::STRICT_FSETCCS);
25406
25407 // If softenSetCCOperands returned a scalar, use it.
25408 if (!Op1.getNode()) {
25409      assert(Op0.getValueType() == Op.getValueType() &&
25410             "Unexpected setcc expansion!");
25411 if (IsStrict)
25412 return DAG.getMergeValues({Op0, Chain}, dl);
25413 return Op0;
25414 }
25415 }
25416
25417 if (Op0.getSimpleValueType().isInteger()) {
25418    // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
25419    // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
25420    // this may translate to fewer uops depending on the uarch implementation. The
25421 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25422 // canonicalize to that CondCode.
25423 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25424    // encoding size - so it must either already be an i8 or i32 immediate, or it
25425 // shrinks down to that. We don't do this for any i64's to avoid additional
25426 // constant materializations.
25427 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
25428 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
25429 const APInt &Op1Val = Op1C->getAPIntValue();
25430 if (!Op1Val.isZero()) {
25431 // Ensure the constant+1 doesn't overflow.
25432 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
25433 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
25434 APInt Op1ValPlusOne = Op1Val + 1;
25435 if (Op1ValPlusOne.isSignedIntN(32) &&
25436 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
25437 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
25438 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
25439 : ISD::CondCode::SETUGE;
25440 }
25441 }
25442 }
25443 }
25444
25445 SDValue X86CC;
25446 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
25447 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25448 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25449 }
25450
25451 // Handle floating point.
25452 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
25453 if (CondCode == X86::COND_INVALID)
25454 return SDValue();
25455
25456 SDValue EFLAGS;
25457 if (IsStrict) {
25458 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25459 EFLAGS =
25460 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
25461 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
25462 Chain = EFLAGS.getValue(1);
25463 } else {
25464 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
25465 }
25466
25467 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25468 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25469 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25470}
25471
25472SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
25473 SDValue LHS = Op.getOperand(0);
25474 SDValue RHS = Op.getOperand(1);
25475 SDValue Carry = Op.getOperand(2);
25476 SDValue Cond = Op.getOperand(3);
25477 SDLoc DL(Op);
25478
25479  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
25480 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
25481
25482 // Recreate the carry if needed.
25483 EVT CarryVT = Carry.getValueType();
25484 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25485 Carry, DAG.getAllOnesConstant(DL, CarryVT));
25486
25487 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
25488 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
25489 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
25490}
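// Rough sketch of the carry recreation above: X86ISD::ADD (Carry, -1) sets CF
// exactly when the incoming carry value is nonzero, and the following SBB
// folds that borrow into LHS - RHS so the final SETCC reads the right flags.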
25491
25492// This function returns three things: the arithmetic computation itself
25493// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
25494// flag and the condition code define the case in which the arithmetic
25495// computation overflows.
25496static std::pair<SDValue, SDValue>
25497getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
25498  assert(Op.getResNo() == 0 && "Unexpected result number!");
25499 SDValue Value, Overflow;
25500 SDValue LHS = Op.getOperand(0);
25501 SDValue RHS = Op.getOperand(1);
25502 unsigned BaseOp = 0;
25503 SDLoc DL(Op);
25504 switch (Op.getOpcode()) {
25505  default: llvm_unreachable("Unknown ovf instruction!");
25506 case ISD::SADDO:
25507 BaseOp = X86ISD::ADD;
25508 Cond = X86::COND_O;
25509 break;
25510 case ISD::UADDO:
25511 BaseOp = X86ISD::ADD;
25512 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
25513 break;
25514 case ISD::SSUBO:
25515 BaseOp = X86ISD::SUB;
25516 Cond = X86::COND_O;
25517 break;
25518 case ISD::USUBO:
25519 BaseOp = X86ISD::SUB;
25520 Cond = X86::COND_B;
25521 break;
25522 case ISD::SMULO:
25523 BaseOp = X86ISD::SMUL;
25524 Cond = X86::COND_O;
25525 break;
25526 case ISD::UMULO:
25527 BaseOp = X86ISD::UMUL;
25528 Cond = X86::COND_O;
25529 break;
25530 }
25531
25532 if (BaseOp) {
25533 // Also sets EFLAGS.
25534 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
25535 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
25536 Overflow = Value.getValue(1);
25537 }
25538
25539 return std::make_pair(Value, Overflow);
25540}
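// Hedged example: ISD::SADDO maps to X86ISD::ADD with X86::COND_O (overflow
// flag), ISD::USUBO to X86ISD::SUB with X86::COND_B (carry flag); the second
// result of the glued node is the EFLAGS value consumed by SETCC/BRCOND later.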
25541
25542static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
25543 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
25544 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
25545 // looks for this combo and may remove the "setcc" instruction if the "setcc"
25546 // has only one use.
25547 SDLoc DL(Op);
25548 X86::CondCode Cond;
25549 SDValue Value, Overflow;
25550 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
25551
25552 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
25553  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
25554 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
25555}
25556
25557/// Return true if opcode is a X86 logical comparison.
25558static bool isX86LogicalCmp(SDValue Op) {
25559 unsigned Opc = Op.getOpcode();
25560 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
25561 Opc == X86ISD::FCMP)
25562 return true;
25563 if (Op.getResNo() == 1 &&
25564 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
25565 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
25566 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
25567 return true;
25568
25569 return false;
25570}
25571
25572static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
25573 if (V.getOpcode() != ISD::TRUNCATE)
25574 return false;
25575
25576 SDValue VOp0 = V.getOperand(0);
25577 unsigned InBits = VOp0.getValueSizeInBits();
25578 unsigned Bits = V.getValueSizeInBits();
25579 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
25580}
25581
25582SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25583 bool AddTest = true;
25584 SDValue Cond = Op.getOperand(0);
25585 SDValue Op1 = Op.getOperand(1);
25586 SDValue Op2 = Op.getOperand(2);
25587 SDLoc DL(Op);
25588 MVT VT = Op1.getSimpleValueType();
25589 SDValue CC;
25590
25591 if (isSoftFP16(VT)) {
25592 MVT NVT = VT.changeTypeToInteger();
25593 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25594 DAG.getBitcast(NVT, Op1),
25595 DAG.getBitcast(NVT, Op2)));
25596 }
25597
25598 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25599 // are available or VBLENDV if AVX is available.
25600 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25601 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25602 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25603 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25604 bool IsAlwaysSignaling;
25605 unsigned SSECC =
25606 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25607 CondOp0, CondOp1, IsAlwaysSignaling);
25608
25609 if (Subtarget.hasAVX512()) {
25610 SDValue Cmp =
25611 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25612 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25613      assert(!VT.isVector() && "Not a scalar type?");
25614 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25615 }
25616
25617 if (SSECC < 8 || Subtarget.hasAVX()) {
25618 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25619 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25620
25621 // If we have AVX, we can use a variable vector select (VBLENDV) instead
25622 // of 3 logic instructions for size savings and potentially speed.
25623 // Unfortunately, there is no scalar form of VBLENDV.
25624
25625 // If either operand is a +0.0 constant, don't try this. We can expect to
25626 // optimize away at least one of the logic instructions later in that
25627 // case, so that sequence would be faster than a variable blend.
25628
25629 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
25630 // uses XMM0 as the selection register. That may need just as many
25631 // instructions as the AND/ANDN/OR sequence due to register moves, so
25632 // don't bother.
25633 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
25634 !isNullFPConstant(Op2)) {
25635 // Convert to vectors, do a VSELECT, and convert back to scalar.
25636 // All of the conversions should be optimized away.
25637 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25638 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25639 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25640 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25641
25642 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25643 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25644
25645 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25646
25647 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
25648 VSel, DAG.getIntPtrConstant(0, DL));
25649 }
25650 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25651 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25652 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25653 }
25654 }
25655
25656 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25657 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25658 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25659 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25660 }
25661
25662 if (Cond.getOpcode() == ISD::SETCC &&
25663 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
25664 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25665 Cond = NewCond;
25666 // If the condition was updated, it's possible that the operands of the
25667 // select were also updated (for example, EmitTest has a RAUW). Refresh
25668 // the local references to the select operands in case they got stale.
25669 Op1 = Op.getOperand(1);
25670 Op2 = Op.getOperand(2);
25671 }
25672 }
25673
25674 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25675 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25676 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25677 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25678 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25679 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25680 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25681 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
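// A concrete (illustrative) instance of the first pattern above: for i32,
//   select (x == 0), -1, y
// is lowered as sub x, 1 (CF is set iff x == 0), then SETCC_CARRY/SBB turns
// CF into a 0/-1 mask, and the result is OR'd with y.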
25682 if (Cond.getOpcode() == X86ISD::SETCC &&
25683 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25684 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25685 SDValue Cmp = Cond.getOperand(1);
25686 SDValue CmpOp0 = Cmp.getOperand(0);
25687 unsigned CondCode = Cond.getConstantOperandVal(0);
25688
25689    // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
25690    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25691    // handling to keep the CMP with 0. This should be removed by
25692 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25693 // cttz_zero_undef.
25694 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25695 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25696 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25697 };
25698 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25699 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25700 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25701 // Keep Cmp.
25702 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25703 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
25704 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
25705 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
25706
25707 // 'X - 1' sets the carry flag if X == 0.
25708 // '0 - X' sets the carry flag if X != 0.
25709 // Convert the carry flag to a -1/0 mask with sbb:
25710 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
25711 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
25712 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
25713 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
25714 SDValue Sub;
25715 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
25716 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
25717 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
25718 } else {
25719 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
25720 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
25721 }
25722 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25723 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
25724 Sub.getValue(1));
25725 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
25726 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
25727 CmpOp0.getOpcode() == ISD::AND &&
25728 isOneConstant(CmpOp0.getOperand(1))) {
25729 SDValue Src1, Src2;
25730      // True if Op2 is an XOR or OR operator and one of its operands
25731      // is equal to Op1, i.e. it matches (a, a op b) or (b, a op b).
25733 auto isOrXorPattern = [&]() {
25734 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
25735 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
25736 Src1 =
25737 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
25738 Src2 = Op1;
25739 return true;
25740 }
25741 return false;
25742 };
25743
25744 if (isOrXorPattern()) {
25745 SDValue Neg;
25746 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
25747        // We need a mask of all zeros or all ones with the same size as the
25748        // other operands.
25749 if (CmpSz > VT.getSizeInBits())
25750 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
25751 else if (CmpSz < VT.getSizeInBits())
25752 Neg = DAG.getNode(ISD::AND, DL, VT,
25753 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
25754 DAG.getConstant(1, DL, VT));
25755 else
25756 Neg = CmpOp0;
25757 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
25758 Neg); // -(and (x, 0x1))
25759 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
25760 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
25761 }
25762 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
25763 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25764 ((CondCode == X86::COND_S) || // smin(x, 0)
25765 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25766 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25767 //
25768 // If the comparison is testing for a positive value, we have to invert
25769 // the sign bit mask, so only do that transform if the target has a
25770 // bitwise 'and not' instruction (the invert is free).
25771 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25772 unsigned ShCt = VT.getSizeInBits() - 1;
25773 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25774 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25775 if (CondCode == X86::COND_G)
25776 Shift = DAG.getNOT(DL, Shift, VT);
25777 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25778 }
25779 }
25780
25781 // Look past (and (setcc_carry (cmp ...)), 1).
25782 if (Cond.getOpcode() == ISD::AND &&
25783 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25784 isOneConstant(Cond.getOperand(1)))
25785 Cond = Cond.getOperand(0);
25786
25787  // If the condition flag is set by an X86ISD::CMP, then use it as the
25788  // condition-setting operand in place of the X86ISD::SETCC.
25789 unsigned CondOpcode = Cond.getOpcode();
25790 if (CondOpcode == X86ISD::SETCC ||
25791 CondOpcode == X86ISD::SETCC_CARRY) {
25792 CC = Cond.getOperand(0);
25793
25794 SDValue Cmp = Cond.getOperand(1);
25795 bool IllegalFPCMov = false;
25796 if (VT.isFloatingPoint() && !VT.isVector() &&
25797 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25798 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25799
25800 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25801 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25802 Cond = Cmp;
25803 AddTest = false;
25804 }
25805 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25806 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25807 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25808 SDValue Value;
25809 X86::CondCode X86Cond;
25810 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25811
25812 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25813 AddTest = false;
25814 }
25815
25816 if (AddTest) {
25817 // Look past the truncate if the high bits are known zero.
25818 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25819 Cond = Cond.getOperand(0);
25820
25821 // We know the result of AND is compared against zero. Try to match
25822 // it to BT.
25823 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25824 X86::CondCode X86CondCode;
25825 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25826 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25827 Cond = BT;
25828 AddTest = false;
25829 }
25830 }
25831 }
25832
25833 if (AddTest) {
25834 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25835 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25836 }
25837
25838 // a < b ? -1 : 0 -> RES = ~setcc_carry
25839 // a < b ? 0 : -1 -> RES = setcc_carry
25840 // a >= b ? -1 : 0 -> RES = setcc_carry
25841 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25842 if (Cond.getOpcode() == X86ISD::SUB) {
25843 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
25844
25845 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25846 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25847 (isNullConstant(Op1) || isNullConstant(Op2))) {
25848 SDValue Res =
25849 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25850 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25851 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25852 return DAG.getNOT(DL, Res, Res.getValueType());
25853 return Res;
25854 }
25855 }
25856
25857 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25858 // widen the cmov and push the truncate through. This avoids introducing a new
25859 // branch during isel and doesn't add any extensions.
25860 if (Op.getValueType() == MVT::i8 &&
25861 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25862 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25863 if (T1.getValueType() == T2.getValueType() &&
25864 // Exclude CopyFromReg to avoid partial register stalls.
25865 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25866 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25867 CC, Cond);
25868 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25869 }
25870 }
25871
25872 // Or finally, promote i8 cmovs if we have CMOV,
25873 // or i16 cmovs if it won't prevent folding a load.
25874 // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
25875 // legal, but EmitLoweredSelect() cannot deal with these extensions
25876 // being inserted between two CMOVs (the i16 case has the same limitation).
25877 // https://bugs.llvm.org/show_bug.cgi?id=40974
25878 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25879 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25880 !X86::mayFoldLoad(Op2, Subtarget))) {
25881 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25882 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25883 SDValue Ops[] = { Op2, Op1, CC, Cond };
25884 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25885 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25886 }
25887
25888 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25889 // condition is true.
25890 SDValue Ops[] = { Op2, Op1, CC, Cond };
25891 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
25892}
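The comments above rely on the scalar identities smin(x, 0) == (x >> bits-1) & x and smax(x, 0) == ~(x >> bits-1) & x. The following is a minimal standalone sketch of those identities, not part of the LLVM source; the helper names are illustrative only, and it assumes the arithmetic right shift of signed values that x86 compilers provide.

#include <algorithm>
#include <cassert>
#include <cstdint>

// min(x, 0): the arithmetic shift yields all-ones exactly when x is negative.
int32_t sminZero(int32_t x) { return (x >> 31) & x; }
// max(x, 0): invert the sign mask, as the COND_G ('and not') path above does.
int32_t smaxZero(int32_t x) { return ~(x >> 31) & x; }

int main() {
  for (int32_t x : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
    assert(sminZero(x) == std::min<int32_t>(x, 0));
    assert(smaxZero(x) == std::max<int32_t>(x, 0));
  }
}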
25893
25894static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
25895 const X86Subtarget &Subtarget,
25896 SelectionDAG &DAG) {
25897 MVT VT = Op->getSimpleValueType(0);
25898 SDValue In = Op->getOperand(0);
25899 MVT InVT = In.getSimpleValueType();
25900 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25901 MVT VTElt = VT.getVectorElementType();
25902 SDLoc dl(Op);
25903
25904 unsigned NumElts = VT.getVectorNumElements();
25905
25906 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25907 MVT ExtVT = VT;
25908 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25909 // If v16i32 is to be avoided, we'll need to split and concatenate.
25910 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25911 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25912
25913 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25914 }
25915
25916 // Widen to 512-bits if VLX is not supported.
25917 MVT WideVT = ExtVT;
25918 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25919 NumElts *= 512 / ExtVT.getSizeInBits();
25920 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25921 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
25922 In, DAG.getIntPtrConstant(0, dl));
25923 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25924 }
25925
25926 SDValue V;
25927 MVT WideEltVT = WideVT.getVectorElementType();
25928 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25929 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25930 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25931 } else {
25932 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
25933 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25934 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25935 }
25936
25937 // Truncate if we had to extend i16/i8 above.
25938 if (VT != ExtVT) {
25939 WideVT = MVT::getVectorVT(VTElt, NumElts);
25940 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25941 }
25942
25943 // Extract back to 128/256-bit if we widened.
25944 if (WideVT != VT)
25945 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25946 DAG.getIntPtrConstant(0, dl));
25947
25948 return V;
25949}
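When neither the DQI nor the BWI fast path applies, LowerSIGN_EXTEND_Mask above falls back to a vselect of all-ones against zero. A standalone scalar sketch of what that computes per lane (not LLVM code; purely illustrative, with plain arrays standing in for the vectors):

#include <cassert>
#include <cstdint>

int main() {
  bool mask[4] = {true, false, true, false};  // the v4i1 input
  int32_t out[4];
  for (int i = 0; i != 4; ++i)
    out[i] = mask[i] ? -1 : 0;                // vselect(mask, all-ones, zero)
  assert(out[0] == -1 && out[1] == 0 && out[2] == -1 && out[3] == 0);
}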
25950
25951static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25952 SelectionDAG &DAG) {
25953 SDValue In = Op->getOperand(0);
25954 MVT InVT = In.getSimpleValueType();
25955
25956 if (InVT.getVectorElementType() == MVT::i1)
25957 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25958
25959 assert(Subtarget.hasAVX() && "Expected AVX support");
25960 return LowerAVXExtend(Op, DAG, Subtarget);
25961}
25962
25963// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25964// For sign extend this needs to handle all vector sizes and SSE4.1 and
25965// non-SSE4.1 targets. For zero extend this should only handle inputs of
25966// MVT::v64i8 when BWI is not supported, but AVX512 is.
25967static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25968 const X86Subtarget &Subtarget,
25969 SelectionDAG &DAG) {
25970 SDValue In = Op->getOperand(0);
25971 MVT VT = Op->getSimpleValueType(0);
25972 MVT InVT = In.getSimpleValueType();
25973
25974 MVT SVT = VT.getVectorElementType();
25975 MVT InSVT = InVT.getVectorElementType();
25976 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25977
25978 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25979 return SDValue();
25980 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25981 return SDValue();
25982 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25983 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25984 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25985 return SDValue();
25986
25987 SDLoc dl(Op);
25988 unsigned Opc = Op.getOpcode();
25989 unsigned NumElts = VT.getVectorNumElements();
25990
25991 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25992 // For 512-bit vectors, we need 128-bits or 256-bits.
25993 if (InVT.getSizeInBits() > 128) {
25994 // Input needs to be at least the same number of elements as output, and
25995 // at least 128-bits.
25996 int InSize = InSVT.getSizeInBits() * NumElts;
25997 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25998 InVT = In.getSimpleValueType();
25999 }
26000
26001 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26002 // so those cases are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions
26003 // still need to be handled here for 256/512-bit results.
26004 if (Subtarget.hasInt256()) {
26005 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26006
26007 if (InVT.getVectorNumElements() != NumElts)
26008 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26009
26010 // FIXME: Apparently we create inreg operations that could be regular
26011 // extends.
26012 unsigned ExtOpc =
26013 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26014 : ISD::ZERO_EXTEND;
26015 return DAG.getNode(ExtOpc, dl, VT, In);
26016 }
26017
26018 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26019 if (Subtarget.hasAVX()) {
26020 assert(VT.is256BitVector() && "256-bit vector expected");
26021 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26022 int HalfNumElts = HalfVT.getVectorNumElements();
26023
26024 unsigned NumSrcElts = InVT.getVectorNumElements();
26025 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26026 for (int i = 0; i != HalfNumElts; ++i)
26027 HiMask[i] = HalfNumElts + i;
26028
26029 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26030 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26031 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26032 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26033 }
26034
26035 // We should only get here for sign extend.
26036 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26037 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26038
26039 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26040 SDValue Curr = In;
26041 SDValue SignExt = Curr;
26042
26043 // As SRAI is only available on i16/i32 types, we expand only up to i32
26044 // and handle i64 separately.
26045 if (InVT != MVT::v4i32) {
26046 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26047
26048 unsigned DestWidth = DestVT.getScalarSizeInBits();
26049 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26050
26051 unsigned InNumElts = InVT.getVectorNumElements();
26052 unsigned DestElts = DestVT.getVectorNumElements();
26053
26054 // Build a shuffle mask that takes each input element and places it in the
26055 // MSBs of the new element size.
26056 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26057 for (unsigned i = 0; i != DestElts; ++i)
26058 Mask[i * Scale + (Scale - 1)] = i;
26059
26060 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26061 Curr = DAG.getBitcast(DestVT, Curr);
26062
26063 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26064 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26065 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26066 }
26067
26068 if (VT == MVT::v2i64) {
26069 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26070 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26071 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26072 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26073 SignExt = DAG.getBitcast(VT, SignExt);
26074 }
26075
26076 return SignExt;
26077}
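The pre-SSE41 path above shuffles each narrow element into the most-significant bits of its destination lane, pulls it back down with an arithmetic shift (VSRAI), and builds the v2i64 high words from a compare against zero. A standalone scalar sketch of both steps (not LLVM code; function names are illustrative, and it assumes the arithmetic right shift and two's-complement conversions x86 compilers provide):

#include <cassert>
#include <cstdint>

// i8 -> i32: park the byte in the top byte of the lane, then shift it back.
int32_t sextViaShift(int8_t b) {
  uint32_t lane = uint32_t(uint8_t(b)) << 24; // what the shuffle mask arranges
  return int32_t(lane) >> 24;                 // VSRAI by DestWidth - SrcWidth = 24
}

// i32 -> i64: the high word is (0 > x) ? -1 : 0, then the two are interleaved.
int64_t sext32to64(int32_t lo) {
  uint32_t hi = (0 > lo) ? 0xFFFFFFFFu : 0u;  // the SETGT(zero, x) sign word
  return int64_t((uint64_t(hi) << 32) | uint32_t(lo));
}

int main() {
  assert(sextViaShift(int8_t(-5)) == -5);
  assert(sextViaShift(int8_t(100)) == 100);
  assert(sext32to64(-5) == -5);
  assert(sext32to64(7) == 7);
}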
26078
26079static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26080 SelectionDAG &DAG) {
26081 MVT VT = Op->getSimpleValueType(0);
26082 SDValue In = Op->getOperand(0);
26083 MVT InVT = In.getSimpleValueType();
26084 SDLoc dl(Op);
26085
26086 if (InVT.getVectorElementType() == MVT::i1)
26087 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26088
26089 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26090 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26091 "Expected same number of elements");
26092 assert((VT.getVectorElementType() == MVT::i16 ||
26093 VT.getVectorElementType() == MVT::i32 ||
26094 VT.getVectorElementType() == MVT::i64) &&
26095 "Unexpected element type");
26096 assert((InVT.getVectorElementType() == MVT::i8 ||
26097 InVT.getVectorElementType() == MVT::i16 ||
26098 InVT.getVectorElementType() == MVT::i32) &&
26099 "Unexpected element type");
26100
26101 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26102 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26103 return splitVectorIntUnary(Op, DAG);
26104 }
26105
26106 if (Subtarget.hasInt256())
26107 return Op;
26108
26109 // Optimize vectors in AVX mode:
26110 // sign extend v8i16 to v8i32 and
26111 // v4i32 to v4i64.
26112 //
26113 // Divide the input vector into two parts;
26114 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
26115 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
26116 // then concat the vectors back to the original VT.
26117 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26118 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26119
26120 unsigned NumElems = InVT.getVectorNumElements();
26121 SmallVector<int,8> ShufMask(NumElems, -1);
26122 for (unsigned i = 0; i != NumElems/2; ++i)
26123 ShufMask[i] = i + NumElems/2;
26124
26125 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26126 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26127
26128 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26129}
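A standalone sketch of the pre-AVX2 split performed above for a v4i32 -> v4i64 sign extend, mirroring the {2, 3, -1, -1} high-half shuffle described in the comment (not LLVM code; plain arrays stand in for the vectors):

#include <cassert>
#include <cstdint>

int main() {
  int32_t in[4] = {-1, 2, -3, 4};
  int64_t lo[2] = {in[0], in[1]};                 // vpmovsxdq on the low half
  int32_t shuffled[2] = {in[2], in[3]};           // high half moved to the bottom
  int64_t hi[2] = {shuffled[0], shuffled[1]};     // vpmovsxdq on the moved half
  int64_t out[4] = {lo[0], lo[1], hi[0], hi[1]};  // concat back to the original width
  for (int i = 0; i != 4; ++i)
    assert(out[i] == int64_t(in[i]));
}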
26130
26131/// Change a vector store into a pair of half-size vector stores.
26132static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26133 SDValue StoredVal = Store->getValue();
26134 assert((StoredVal.getValueType().is256BitVector() ||
26135 StoredVal.getValueType().is512BitVector()) &&
26136 "Expecting 256/512-bit op");
26137
26138 // Splitting volatile memory ops is not allowed unless the operation was not
26139 // legal to begin with. Assume the input store is legal (this transform is
26140 // only used for targets with AVX). Note: It is possible that we have an
26141 // illegal type like v2i128, and so we could allow splitting a volatile store
26142 // in that case if that is important.
26143 if (!Store->isSimple())
26144 return SDValue();
26145
26146 SDLoc DL(Store);
26147 SDValue Value0, Value1;
26148 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26149 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26150 SDValue Ptr0 = Store->getBasePtr();
26151 SDValue Ptr1 =
26152 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26153 SDValue Ch0 =
26154 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26155 Store->getOriginalAlign(),
26156 Store->getMemOperand()->getFlags());
26157 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26158 Store->getPointerInfo().getWithOffset(HalfOffset),
26159 Store->getOriginalAlign(),
26160 Store->getMemOperand()->getFlags());
26161 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26162}
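A standalone sketch of the splitting above for the 256-bit case, with plain memory copies standing in for the two half-width stores at offsets 0 and HalfOffset (not LLVM code; illustrative only):

#include <cassert>
#include <cstdint>
#include <cstring>

// One 32-byte store becomes two 16-byte stores; the second pointer is the
// base plus the store size of the first half (HalfOffset in the code above).
void storeSplit256(uint8_t *dst, const uint8_t *val) {
  std::memcpy(dst, val, 16);
  std::memcpy(dst + 16, val + 16, 16);
}

int main() {
  uint8_t v[32], out[32] = {};
  for (int i = 0; i != 32; ++i) v[i] = uint8_t(i);
  storeSplit256(out, v);
  assert(std::memcmp(out, v, 32) == 0);
}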
26163
26164/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26165/// type.
26166static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26167 SelectionDAG &DAG) {
26168 SDValue StoredVal = Store->getValue();
26169 assert(StoreVT.is128BitVector() &&
26170 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26171 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26172
26173 // Splitting volatile memory ops is not allowed unless the operation was not
26174 // legal to begin with. We are assuming the input op is legal (this transform
26175 // is only used for targets with AVX).
26176 if (!Store->isSimple())
26177 return SDValue();
26178
26179 MVT StoreSVT = StoreVT.getScalarType();
26180 unsigned NumElems = StoreVT.getVectorNumElements();
26181 unsigned ScalarSize = StoreSVT.getStoreSize();
26182
26183 SDLoc DL(Store);
26184 SmallVector<SDValue, 4> Stores;
26185 for (unsigned i = 0; i != NumElems; ++i) {
26186 unsigned Offset = i * ScalarSize;
26187 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26188 TypeSize::Fixed(Offset), DL);
26189 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26190 DAG.getIntPtrConstant(i, DL));
26191 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26192 Store->getPointerInfo().getWithOffset(Offset),
26193 Store->getOriginalAlign(),
26194 Store->getMemOperand()->getFlags());
26195 Stores.push_back(Ch);
26196 }
26197 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26198}
26199
26200static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26201 SelectionDAG &DAG) {
26202 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26203 SDLoc dl(St);
26204 SDValue StoredVal = St->getValue();
26205
26206 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26207 if (StoredVal.getValueType().isVector() &&
26208 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26209 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26210 assert(NumElts <= 8 && "Unexpected VT");
26211 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26212 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26213 "Expected AVX512F without AVX512DQI");
26214
26215 // We must pad with zeros to ensure we store zeroes to any unused bits.
26216 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26217 DAG.getUNDEF(MVT::v16i1), StoredVal,
26218 DAG.getIntPtrConstant(0, dl));
26219 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26220 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26221 // Make sure we store zeros in the extra bits.
26222 if (NumElts < 8)
26223 StoredVal = DAG.getZeroExtendInReg(
26224 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26225
26226 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26227 St->getPointerInfo(), St->getOriginalAlign(),
26228 St->getMemOperand()->getFlags());
26229 }
26230
26231 if (St->isTruncatingStore())
26232 return SDValue();
26233
26234 // If this is a 256-bit store of concatenated ops, we are better off splitting
26235 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26236 // and each half can execute independently. Some cores would split the op into
26237 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26238 MVT StoreVT = StoredVal.getSimpleValueType();
26239 if (StoreVT.is256BitVector() ||
26240 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26241 !Subtarget.hasBWI())) {
26242 SmallVector<SDValue, 4> CatOps;
26243 if (StoredVal.hasOneUse() &&
26244 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26245 return splitVectorStore(St, DAG);
26246 return SDValue();
26247 }
26248
26249 if (StoreVT.is32BitVector())
26250 return SDValue();
26251
26252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26253 assert(StoreVT.is64BitVector() && "Unexpected VT");
26254 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26255 TargetLowering::TypeWidenVector &&
26256 "Unexpected type action!");
26257
26258 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26259 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26260 DAG.getUNDEF(StoreVT));
26261
26262 if (Subtarget.hasSSE2()) {
26263 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26264 // and store it.
26265 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26266 MVT CastVT = MVT::getVectorVT(StVT, 2);
26267 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26268 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26269 DAG.getIntPtrConstant(0, dl));
26270
26271 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26272 St->getPointerInfo(), St->getOriginalAlign(),
26273 St->getMemOperand()->getFlags());
26274 }
26275 assert(Subtarget.hasSSE1() && "Expected SSE");
26276 SDVTList Tys = DAG.getVTList(MVT::Other);
26277 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26278 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26279 St->getMemOperand());
26280}
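For the i1-vector case handled at the top of LowerStore, the mask ends up packed into the low bits of a single byte with the unused bits explicitly zeroed before the scalar store. A standalone sketch of that packing (not LLVM code; illustrative only):

#include <cassert>
#include <cstdint>

// Lane i of the mask becomes bit i of the stored byte; bits >= numElts are
// forced to zero, mirroring the getZeroExtendInReg step above.
uint8_t packMaskForStore(const bool *mask, unsigned numElts) {
  uint8_t bits = 0;
  for (unsigned i = 0; i != numElts; ++i)
    bits |= uint8_t(mask[i] ? 1u << i : 0u);
  return bits & uint8_t((1u << numElts) - 1);
}

int main() {
  bool m[4] = {true, false, true, true};
  assert(packMaskForStore(m, 4) == 0b1101);
}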
26281
26282// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26283// may emit an illegal shuffle but the expansion is still better than scalar
26284// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26285 // we'll emit a shuffle and an arithmetic shift.
26286// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26287// TODO: It is possible to support ZExt by zeroing the undef values during
26288// the shuffle phase or after the shuffle.
26289static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26290 SelectionDAG &DAG) {
26291 MVT RegVT = Op.getSimpleValueType();
26292 assert(RegVT.isVector() && "We only custom lower vector loads.");
26293 assert(RegVT.isInteger() &&
26294 "We only custom lower integer vector loads.");
26295
26296 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26297 SDLoc dl(Ld);
26298
26299 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26300 if (RegVT.getVectorElementType() == MVT::i1) {
26301 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26302 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26303 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26304 "Expected AVX512F without AVX512DQI");
26305
26306 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26307 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26308 Ld->getMemOperand()->getFlags());
26309
26310 // Replace chain users with the new chain.
26311 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26312
26313 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26314 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26315 DAG.getBitcast(MVT::v16i1, Val),
26316 DAG.getIntPtrConstant(0, dl));
26317 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26318 }
26319
26320 return SDValue();
26321}
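The i1-vector load path above is the mirror image of the store case: a single byte is loaded and, through the bitcast to v16i1, bit i of that byte becomes mask lane i, with the upper lanes discarded by the subvector extract. A standalone sketch (not LLVM code; illustrative only):

#include <cassert>
#include <cstdint>

// Bit i of the loaded byte becomes mask lane i; lanes beyond numElts are dropped.
void unpackMaskFromLoad(uint8_t byte, bool *mask, unsigned numElts) {
  for (unsigned i = 0; i != numElts; ++i)
    mask[i] = (byte >> i) & 1;
}

int main() {
  bool m[4];
  unpackMaskFromLoad(0b1101, m, 4);
  assert(m[0] && !m[1] && m[2] && m[3]);
}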
26322
26323/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26324/// each of which has no other use apart from the AND / OR.
26325static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26326 Opc = Op.getOpcode();
26327 if (Opc != ISD::OR && Opc != ISD::AND)
26328 return false;
26329 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26330 Op.getOperand(0).hasOneUse() &&
26331 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26332 Op.getOperand(1).hasOneUse());
26333}
26334
26335SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26336 SDValue Chain = Op.getOperand(0);
26337 SDValue Cond = Op.getOperand(1);
26338 SDValue Dest = Op.getOperand(2);
26339 SDLoc dl(Op);
26340
26341 // Bail out when we don't have native compare instructions.
26342 if (Cond.getOpcode() == ISD::SETCC &&
26343 Cond.getOperand(0).getValueType() != MVT::f128 &&
26344 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26345 SDValue LHS = Cond.getOperand(0);
26346 SDValue RHS = Cond.getOperand(1);
26347 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26348
26349 // Special case for
26350 // setcc([su]{add,sub,mul}o == 0)
26351 // setcc([su]{add,sub,mul}o != 1)
26352 if (ISD::isOverflowIntrOpRes(LHS) &&
26353 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26354 (isNullConstant(RHS) || isOneConstant(RHS))) {
26355 SDValue Value, Overflow;
26356 X86::CondCode X86Cond;
26357 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26358
26359 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26360 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26361
26362 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26363 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26364 Overflow);
26365 }
26366
26367 if (LHS.getSimpleValueType().isInteger()) {
26368 SDValue CCVal;
26369 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26370 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26371 EFLAGS);
26372 }
26373
26374 if (CC == ISD::SETOEQ) {
26375 // For FCMP_OEQ, we can emit
26376 // two branches instead of an explicit AND instruction with a
26377 // separate test. However, we only do this if this block doesn't
26378 // have a fall-through edge, because this requires an explicit
26379 // jmp when the condition is false.
26380 if (Op.getNode()->hasOneUse()) {
26381 SDNode *User = *Op.getNode()->use_begin();
26382 // Look for an unconditional branch following this conditional branch.
26383 // We need this because we need to reverse the successors in order
26384 // to implement FCMP_OEQ.
26385 if (User->getOpcode() == ISD::BR) {
26386 SDValue FalseBB = User->getOperand(1);
26387 SDNode *NewBR =
26388 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26389 assert(NewBR == User);
26390 (void)NewBR;
26391 Dest = FalseBB;
26392
26393 SDValue Cmp =
26394 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26395 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26396 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26397 CCVal, Cmp);
26398 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26399 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26400 Cmp);
26401 }
26402 }
26403 } else if (CC == ISD::SETUNE) {
26404 // For FCMP_UNE, we can emit
26405 // two branches instead of an explicit OR instruction with a
26406 // separate test.
26407 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26408 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26409 Chain =
26410 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26411 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26412 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26413 Cmp);
26414 } else {
26415 X86::CondCode X86Cond =
26416 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26417 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26418 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26419 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26420 Cmp);
26421 }
26422 }
26423
26424 if (ISD::isOverflowIntrOpRes(Cond)) {
26425 SDValue Value, Overflow;
26426 X86::CondCode X86Cond;
26427 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26428
26429 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26430 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26431 Overflow);
26432 }
26433
26434 // Look past the truncate if the high bits are known zero.
26435 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26436 Cond = Cond.getOperand(0);
26437
26438 EVT CondVT = Cond.getValueType();
26439
26440 // Add an AND with 1 if we don't already have one.
26441 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
26442 Cond =
26443 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
26444
26445 SDValue LHS = Cond;
26446 SDValue RHS = DAG.getConstant(0, dl, CondVT);
26447
26448 SDValue CCVal;
26449 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
26450 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26451 EFLAGS);
26452}
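A standalone sketch of the FCMP_OEQ lowering above: instead of AND-ing "ordered" with "equal", two branches (COND_NE, then COND_P) each jump to the false block and the true case falls through. The booleans model the ZF/PF flags that an unordered compare such as ucomisd produces (not LLVM code; illustrative only):

#include <cassert>
#include <cmath>

// ZF is set for equal-or-unordered, PF is set for unordered.
bool branchOEQ(double a, double b) {
  bool pf = std::isnan(a) || std::isnan(b);
  bool zf = pf || (a == b);
  if (!zf) return false;   // first branch (COND_NE) jumps to the false block
  if (pf)  return false;   // second branch (COND_P) also jumps to the false block
  return true;             // fall-through: ordered and equal
}

int main() {
  assert(branchOEQ(1.0, 1.0));
  assert(!branchOEQ(1.0, 2.0));
  assert(!branchOEQ(std::nan(""), 1.0));
}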
26453
26454// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
26455// Calls to _alloca are needed to probe the stack when allocating more than 4k
26456// bytes in one go. Touching the stack at 4K increments is necessary to ensure
26457// that the guard pages used by the OS virtual memory manager are allocated in
26458// correct sequence.
26459SDValue
26460X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
26461 SelectionDAG &DAG) const {
26462 MachineFunction &MF = DAG.getMachineFunction();
26463 bool SplitStack = MF.shouldSplitStack();
26464 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
26465 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
26466 SplitStack || EmitStackProbeCall;
26467 SDLoc dl(Op);
26468
26469 // Get the inputs.
26470 SDNode *Node = Op.getNode();
26471 SDValue Chain = Op.getOperand(0);
26472 SDValue Size = Op.getOperand(1);
26473 MaybeAlign Alignment(Op.getConstantOperandVal(2));
26474 EVT VT = Node->getValueType(0);
26475
26476 // Chain the dynamic stack allocation so that it doesn't modify the stack
26477 // pointer when other instructions are using the stack.
26478 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
26479
26480 bool Is64Bit = Subtarget.is64Bit();
26481 MVT SPTy = getPointerTy(DAG.getDataLayout());
26482
26483 SDValue Result;
26484 if (!Lower) {
26485 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26486 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
26487 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
26488 " not tell us which reg is the stack pointer!");
26489
26490 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26491 const Align StackAlign = TFI.getStackAlign();
26492 if (hasInlineStackProbe(MF)) {
26493 MachineRegisterInfo &MRI = MF.getRegInfo();
26494
26495 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26496 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26497 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26498 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
26499 DAG.getRegister(Vreg, SPTy));
26500 } else {
26501 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
26502 Chain = SP.getValue(1);
26503 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
26504 }
26505 if (Alignment && *Alignment > StackAlign)
26506 Result =
26507 DAG.getNode(ISD::AND, dl, VT, Result,
26508 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
26509 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
26510 } else if (SplitStack) {
26511 MachineRegisterInfo &MRI = MF.getRegInfo();
26512
26513 if (Is64Bit) {
26514 // The 64-bit implementation of segmented stacks needs to clobber both r10
26515 // and r11. This makes it impossible to use it along with nested parameters.
26516 const Function &F = MF.getFunction();
26517 for (const auto &A : F.args()) {
26518 if (A.hasNestAttr())
26519 report_fatal_error("Cannot use segmented stacks with functions that "
26520 "have nested arguments.");
26521 }
26522 }
26523
26524 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26525 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26526 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26527 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
26528 DAG.getRegister(Vreg, SPTy));
26529 } else {
26530 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
26531 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
26532 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
26533
26534 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26535 Register SPReg = RegInfo->getStackRegister();
26536 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
26537 Chain = SP.getValue(1);
26538
26539 if (Alignment) {
26540 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
26541 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
26542 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
26543 }
26544
26545 Result = SP;
26546 }
26547
26548 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
26549
26550 SDValue Ops[2] = {Result, Chain};
26551 return DAG.getMergeValues(Ops, dl);
26552}
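A standalone sketch of the over-alignment step above, where the adjusted stack pointer is rounded down with an AND against ~(Alignment - 1) (not LLVM code; illustrative only, and it assumes a power-of-two alignment just as the code does):

#include <cassert>
#include <cstdint>

// Round an address down to the requested power-of-two alignment.
uint64_t alignDown(uint64_t sp, uint64_t alignment) {
  return sp & ~(alignment - 1);
}

int main() {
  assert(alignDown(0x7fffdeadbef7, 16) == 0x7fffdeadbef0);
  assert(alignDown(0x1000, 64) == 0x1000);  // already aligned: unchanged
}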
26553
26554SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
26555 MachineFunction &MF = DAG.getMachineFunction();
26556 auto PtrVT = getPointerTy(MF.getDataLayout());
26557 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26558
26559 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
26560 SDLoc DL(Op);
26561
26562 if (!Subtarget.is64Bit() ||
26563 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
26564 // vastart just stores the address of the VarArgsFrameIndex slot into the
26565 // memory location argument.
26566 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
26567 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
26568 MachinePointerInfo(SV));
26569 }
26570
26571 // __va_list_tag:
26572 // gp_offset (0 - 6 * 8)
26573 // fp_offset (48 - 48 + 8 * 16)
26574 // overflow_arg_area (point to parameters coming in memory).
26575 // reg_save_area
26576 SmallVector<SDValue, 8> MemOps;
26577 SDValue FIN = Op.getOperand(1);
26578 // Store gp_offset
26579 SDValue Store = DAG.getStore(
26580 Op.getOperand(0), DL,
26581 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
26582 MachinePointerInfo(SV));
26583 MemOps.push_back(Store);
26584
26585 // Store fp_offset
26586 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
26587 Store = DAG.getStore(
26588 Op.getOperand(0), DL,
26589 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
26590 MachinePointerInfo(SV, 4));
26591 MemOps.push_back(Store);
26592
26593 // Store ptr to overflow_arg_area
26594 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
26595 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
26596 Store =
26597 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
26598 MemOps.push_back(Store);
26599
26600 // Store ptr to reg_save_area.
26601 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
26602 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
26603 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
26604 Store = DAG.getStore(
26605 Op.getOperand(0), DL, RSFIN, FIN,
26606 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
26607 MemOps.push_back(Store);
26608 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
26609}
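A standalone sketch of the SysV x86-64 __va_list_tag record that the four stores above populate; for the LP64 case the field offsets 0, 4, 8 and 16 match the MachinePointerInfo offsets used in the code (not LLVM code; the struct and field spellings are written out here for illustration):

#include <cstddef>
#include <cstdint>

struct VaListTag {          // __va_list_tag
  uint32_t gp_offset;       // offset 0:  0 .. 6 * 8
  uint32_t fp_offset;       // offset 4:  48 .. 48 + 8 * 16
  void *overflow_arg_area;  // offset 8:  parameters passed in memory
  void *reg_save_area;      // offset 16: spilled register parameters
};

static_assert(offsetof(VaListTag, fp_offset) == 4, "fp_offset store uses offset 4");
static_assert(offsetof(VaListTag, overflow_arg_area) == 8, "overflow area store uses offset 8");
static_assert(offsetof(VaListTag, reg_save_area) == 16, "reg save area store uses offset 16");

int main() {}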
26610
26611SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
26612 assert(Subtarget.is64Bit() &&
26613 "LowerVAARG only handles 64-bit va_arg!");
26614 assert(Op.getNumOperands() == 4);
26615
26616 MachineFunction &MF = DAG.getMachineFunction();
26617 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
26618 // The Win64 ABI uses char* instead of a structure.
26619 return DAG.expandVAArg(Op.getNode());
26620
26621 SDValue Chain = Op.getOperand(0);
26622 SDValue SrcPtr = Op.getOperand(1);
26623 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
26624 unsigned Align = Op.getConstantOperandVal(3);
26625 SDLoc dl(Op);
26626
26627 EVT ArgVT = Op.getNode()->getValueType(0);
26628 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
26629 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
26630 uint8_t ArgMode;
26631
26632 // Decide which area this value should be read from.
26633 // TODO: Implement the AMD64 ABI in its entirety. This simple
26634 // selection mechanism works only for the basic types.
26635 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26636 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26637 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26638 } else {
26639 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26640 "Unhandled argument type in LowerVAARG");
26641 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26642 }
26643
26644 if (ArgMode == 2) {
26645 // Make sure using fp_offset makes sense.
26646 assert(!Subtarget.useSoftFloat() &&
26647 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26648 Subtarget.hasSSE1());
26649 }
26650
26651 // Insert VAARG node into the DAG
26652 // VAARG returns two values: Variable Argument Address, Chain
26653 SDValue InstOps[] = {Chain, SrcPtr,
26654 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26655 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26656 DAG.getTargetConstant(Align, dl, MVT::i32)};
26657 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
26658 SDValue VAARG = DAG.getMemIntrinsicNode(
26659 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26660 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26661 /*Alignment=*/std::nullopt,
26662 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26663 Chain = VAARG.getValue(1);
26664
26665 // Load the next argument and return it
26666 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26667}
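A standalone sketch of the ArgMode selection above: small floating-point arguments are read through fp_offset (mode 2) and small integer arguments through gp_offset (mode 1), mirroring the if/else in LowerVAARG (not LLVM code; the enum and function names are illustrative only):

#include <cassert>

enum class VaArgMode { GPOffset = 1, FPOffset = 2 };

VaArgMode classifyVaArg(bool isFloatingPoint, unsigned argSizeInBytes) {
  if (isFloatingPoint && argSizeInBytes <= 16)
    return VaArgMode::FPOffset;   // passed in an XMM register, use fp_offset
  return VaArgMode::GPOffset;     // passed in GPR64 register(s), use gp_offset
}

int main() {
  assert(classifyVaArg(true, 8) == VaArgMode::FPOffset);   // e.g. double
  assert(classifyVaArg(false, 8) == VaArgMode::GPOffset);  // e.g. long
}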
26668
26669static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26670 SelectionDAG &DAG) {
26671 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26672 // where a va_list is still an i8*.
26673 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26674 if (Subtarget.isCallingConvWin64(
26675 DAG.getMachineFunction().getFunction().getCallingConv()))
26676 // Probably a Win64 va_copy.
26677 return DAG.expandVACopy(Op.getNode());
26678
26679 SDValue Chain = Op.getOperand(0);
26680 SDValue DstPtr = Op.getOperand(1);
26681 SDValue SrcPtr = Op.getOperand(2);
26682 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26683 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26684 SDLoc DL(Op);
26685
26686 return DAG.getMemcpy(
26687 Chain, DL, DstPtr, SrcPtr,
26688 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26689 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26690 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
26691}
26692
26693// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26694static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26695 switch (Opc) {
26696 case ISD::SHL:
26697 case X86ISD::VSHL:
26698 case X86ISD::VSHLI:
26699 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26700 case ISD::SRL:
26701 case X86ISD::VSRL:
26702 case X86ISD::VSRLI:
26703 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26704 case ISD::SRA:
26705 case X86ISD::VSRA:
26706 case X86ISD::VSRAI:
26707 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26708 }
26709 llvm_unreachable("Unknown target vector shift node");
26710}
26711
26712/// Handle vector element shifts where the shift amount is a constant.
26713/// Takes immediate version of shift as input.
26714static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26715 SDValue SrcOp, uint64_t ShiftAmt,
26716 SelectionDAG &DAG) {
26717 MVT ElementType = VT.getVectorElementType();
26718
26719 // Bitcast the source vector to the output type, this is mainly necessary for
26720 // vXi8/vXi64 shifts.
26721 if (VT != SrcOp.getSimpleValueType())
26722 SrcOp = DAG.getBitcast(VT, SrcOp);
26723
26724 // Fold this packed shift into its first operand if ShiftAmt is 0.
26725 if (ShiftAmt == 0)
26726 return SrcOp;
26727
26728 // Check for ShiftAmt >= element width
26729 if (ShiftAmt >= ElementType.getSizeInBits()) {
26730 if (Opc == X86ISD::VSRAI)
26731 ShiftAmt = ElementType.getSizeInBits() - 1;
26732 else
26733 return DAG.getConstant(0, dl, VT);
26734 }
26735
26736 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26737 && "Unknown target vector shift-by-constant node");
26738
26739 // Fold this packed vector shift into a build vector if SrcOp is a
26740 // vector of Constants or UNDEFs.
26741 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26742 unsigned ShiftOpc;
26743 switch (Opc) {
26744 default: llvm_unreachable("Unknown opcode!");
26745 case X86ISD::VSHLI:
26746 ShiftOpc = ISD::SHL;
26747 break;
26748 case X86ISD::VSRLI:
26749 ShiftOpc = ISD::SRL;
26750 break;
26751 case X86ISD::VSRAI:
26752 ShiftOpc = ISD::SRA;
26753 break;
26754 }
26755
26756 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26757 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26758 return C;
26759 }
26760
26761 return DAG.getNode(Opc, dl, VT, SrcOp,
26762 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26763}
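
As a scalar model of the clamping performed above (an illustration under ordinary two's-complement shift semantics, not code from this lowering): logical shift counts at or beyond the element width fold to zero, while arithmetic counts saturate at width - 1 so the sign bit keeps replicating.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Per-element sketch of the out-of-range handling in getTargetVShiftByConstNode.
static uint32_t srlByConst(uint32_t X, uint64_t Amt) {
  return Amt >= 32 ? 0u : X >> Amt;            // VSRLI/VSHLI: fold to zero
}
static int32_t sraByConst(int32_t X, uint64_t Amt) {
  // VSRAI: clamp to width - 1 (relies on arithmetic right shift of signed
  // values, guaranteed since C++20 and universal in practice).
  return X >> std::min<uint64_t>(Amt, 31);
}

int main() {
  assert(srlByConst(0x80000000u, 35) == 0);
  assert(sraByConst(-8, 35) == -1);            // all sign bits
  return 0;
}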
26764
26765/// Handle vector element shifts by a splat shift amount
26766static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26767 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26768 const X86Subtarget &Subtarget,
26769 SelectionDAG &DAG) {
26770 MVT AmtVT = ShAmt.getSimpleValueType();
26771 assert(AmtVT.isVector() && "Vector shift type mismatch");
26772 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26773 "Illegal vector splat index");
26774
26775 // Move the splat element to the bottom element.
26776 if (ShAmtIdx != 0) {
26777 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26778 Mask[0] = ShAmtIdx;
26779 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26780 }
26781
26782 // Peek through any zext node if we can get back to a 128-bit source.
26783 if (AmtVT.getScalarSizeInBits() == 64 &&
26784 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26785 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26786 ShAmt.getOperand(0).getValueType().isSimple() &&
26787 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26788 ShAmt = ShAmt.getOperand(0);
26789 AmtVT = ShAmt.getSimpleValueType();
26790 }
26791
26792 // See if we can mask off the upper elements using the existing source node.
26793 // The shift uses the entire lower 64 bits of the amount vector, so no need to
26794 // do this for vXi64 types.
26795 bool IsMasked = false;
26796 if (AmtVT.getScalarSizeInBits() < 64) {
26797 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26798 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26799 // If the shift amount has come from a scalar, then zero-extend the scalar
26800 // before moving to the vector.
26801 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26802 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26803 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26804 AmtVT = MVT::v4i32;
26805 IsMasked = true;
26806 } else if (ShAmt.getOpcode() == ISD::AND) {
26807 // If the shift amount is already masked (e.g. for rotation modulo),
26808 // then we can zero-extend it by setting all the other mask elements to
26809 // zero.
26810 SmallVector<SDValue> MaskElts(
26811 AmtVT.getVectorNumElements(),
26812 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26813 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26814 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26815 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26816 {ShAmt.getOperand(1), Mask}))) {
26817 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26818 IsMasked = true;
26819 }
26820 }
26821 }
26822
26823 // Extract if the shift amount vector is larger than 128-bits.
26824 if (AmtVT.getSizeInBits() > 128) {
26825 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26826 AmtVT = ShAmt.getSimpleValueType();
26827 }
26828
26829 // Zero-extend bottom element to v2i64 vector type, either by extension or
26830 // shuffle masking.
26831 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26832 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26833 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26834 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26835 } else if (Subtarget.hasSSE41()) {
26836 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26837 MVT::v2i64, ShAmt);
26838 } else {
26839 SDValue ByteShift = DAG.getTargetConstant(
26840 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26841 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26842 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26843 ByteShift);
26844 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26845 ByteShift);
26846 }
26847 }
26848
26849 // Change opcode to non-immediate version.
26850 Opc = getTargetVShiftUniformOpcode(Opc, true);
26851
26852 // The return type has to be a 128-bit type with the same element
26853 // type as the input type.
26854 MVT EltVT = VT.getVectorElementType();
26855 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26856
26857 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26858 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26859}
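
The pre-SSE4.1 branch above zero-extends the bottom amount element by sliding the whole register left and then right by the same byte count. A small intrinsics sketch of the same trick for a v8i16 amount, assuming an SSE2-capable x86-64 host; the values are invented for illustration:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // Shift-amount vector with garbage in the upper lanes; only lane 0 matters.
  __m128i Amt = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, /*lane 0*/ 3);
  // Byte count = (128 - 16) / 8 = 14: slide the low element to the top and
  // back, zero-filling every other lane, i.e. zero-extend it to 64 bits.
  __m128i ZExt = _mm_srli_si128(_mm_slli_si128(Amt, 14), 14);
  std::printf("zero-extended amount = %lld\n",
              (long long)_mm_cvtsi128_si64(ZExt));
  return 0;
}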
26860
26861/// Return Mask with the necessary casting or extending
26862/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26863static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26864 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26865 const SDLoc &dl) {
26866
26867 if (isAllOnesConstant(Mask))
26868 return DAG.getConstant(1, dl, MaskVT);
26869 if (X86::isZeroNode(Mask))
26870 return DAG.getConstant(0, dl, MaskVT);
26871
26872 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26873
26874 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26875 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26876 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26877 // In 32-bit mode a bitcast of i64 is illegal, so split it into two halves.
26878 SDValue Lo, Hi;
26879 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
26880 DAG.getConstant(0, dl, MVT::i32));
26881 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
26882 DAG.getConstant(1, dl, MVT::i32));
26883
26884 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26885 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26886
26887 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26888 } else {
26889 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26890 Mask.getSimpleValueType().getSizeInBits());
26891 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
26892 // are extracted by EXTRACT_SUBVECTOR.
26893 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26894 DAG.getBitcast(BitcastVT, Mask),
26895 DAG.getIntPtrConstant(0, dl));
26896 }
26897}
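
A scalar sketch of the 32-bit i64 split handled above (illustrative values only): the two i32 halves become the low and high 32 bits of the v64i1 mask, and concatenating them reproduces the original mask.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Mask = 0xDEADBEEFCAFEF00DULL;       // hypothetical 64-lane mask
  uint32_t Lo = (uint32_t)Mask;                // lanes 0..31
  uint32_t Hi = (uint32_t)(Mask >> 32);        // lanes 32..63
  assert((((uint64_t)Hi << 32) | Lo) == Mask); // CONCAT_VECTORS(Lo, Hi)
  return 0;
}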
26898
26899/// Return (and \p Op, \p Mask) for compare instructions or
26900/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26901/// necessary casting or extending for \p Mask when lowering masking intrinsics
26902static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26903 SDValue PreservedSrc,
26904 const X86Subtarget &Subtarget,
26905 SelectionDAG &DAG) {
26906 MVT VT = Op.getSimpleValueType();
26907 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26908 unsigned OpcodeSelect = ISD::VSELECT;
26909 SDLoc dl(Op);
26910
26911 if (isAllOnesConstant(Mask))
26912 return Op;
26913
26914 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26915
26916 if (PreservedSrc.isUndef())
26917 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26918 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26919}
26920
26921/// Creates an SDNode for a predicated scalar operation.
26922/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26923 /// The mask arrives as MVT::i8 and is transformed
26924 /// to MVT::v1i1 while lowering masking intrinsics.
26925/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26926/// "X86select" instead of "vselect". We just can't create the "vselect" node
26927/// for a scalar instruction.
26928static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26929 SDValue PreservedSrc,
26930 const X86Subtarget &Subtarget,
26931 SelectionDAG &DAG) {
26932
26933 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
26934 if (MaskConst->getZExtValue() & 0x1)
26935 return Op;
26936
26937 MVT VT = Op.getSimpleValueType();
26938 SDLoc dl(Op);
26939
26940 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26941 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26942 DAG.getBitcast(MVT::v8i1, Mask),
26943 DAG.getIntPtrConstant(0, dl));
26944 if (Op.getOpcode() == X86ISD::FSETCCM ||
26945 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26946 Op.getOpcode() == X86ISD::VFPCLASSS)
26947 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26948
26949 if (PreservedSrc.isUndef())
26950 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26951 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26952}
26953
26954static int getSEHRegistrationNodeSize(const Function *Fn) {
26955 if (!Fn->hasPersonalityFn())
26956 report_fatal_error(
26957 "querying registration node size for function without personality");
26958 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26959 // WinEHStatePass for the full struct definition.
26960 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26961 case EHPersonality::MSVC_X86SEH: return 24;
26962 case EHPersonality::MSVC_CXX: return 16;
26963 default: break;
26964 }
26965 report_fatal_error(
26966 "can only recover FP for 32-bit MSVC EH personality functions");
26967}
26968
26969/// When the MSVC runtime transfers control to us, either to an outlined
26970/// function or when returning to a parent frame after catching an exception, we
26971/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26972/// Here's the math:
26973/// RegNodeBase = EntryEBP - RegNodeSize
26974/// ParentFP = RegNodeBase - ParentFrameOffset
26975/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26976/// subtracting the offset (negative on x86) takes us back to the parent FP.
26977static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26978 SDValue EntryEBP) {
26979 MachineFunction &MF = DAG.getMachineFunction();
26980 SDLoc dl;
26981
26982 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26983 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26984
26985 // It's possible that the parent function no longer has a personality function
26986 // if the exceptional code was optimized away, in which case we just return
26987 // the incoming EBP.
26988 if (!Fn->hasPersonalityFn())
26989 return EntryEBP;
26990
26991 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26992 // registration, or the .set_setframe offset.
26993 MCSymbol *OffsetSym =
26994 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
26995 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26996 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26997 SDValue ParentFrameOffset =
26998 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26999
27000 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27001 // prologue to RBP in the parent function.
27002 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27003 if (Subtarget.is64Bit())
27004 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27005
27006 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27007 // RegNodeBase = EntryEBP - RegNodeSize
27008 // ParentFP = RegNodeBase - ParentFrameOffset
27009 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27010 DAG.getConstant(RegNodeSize, dl, PtrVT));
27011 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27012}
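
A worked example of the 32-bit recovery arithmetic above; every number is hypothetical and chosen only to make the two subtractions concrete.

#include <cstdio>

int main() {
  unsigned EntryEBP = 0x00200000u;  // hypothetical incoming EBP
  unsigned RegNodeSize = 24;        // MSVC x86 SEH personality (6 words)
  int ParentFrameOffset = -64;      // negative on x86, per the comment above

  unsigned RegNodeBase = EntryEBP - RegNodeSize;        // 0x001fffe8
  unsigned ParentFP = RegNodeBase - ParentFrameOffset;  // 0x00200028
  std::printf("RegNodeBase=%#x ParentFP=%#x\n", RegNodeBase, ParentFP);
  return 0;
}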
27013
27014SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27015 SelectionDAG &DAG) const {
27016 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27017 auto isRoundModeCurDirection = [](SDValue Rnd) {
27018 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27019 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27020
27021 return false;
27022 };
27023 auto isRoundModeSAE = [](SDValue Rnd) {
27024 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27025 unsigned RC = C->getZExtValue();
27026 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27027 // Clear the NO_EXC bit and check remaining bits.
27028 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27029 // As a convenience we allow either no other bits set or an explicit
27030 // current-direction value.
27031 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27032 }
27033 }
27034
27035 return false;
27036 };
27037 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27038 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27039 RC = C->getZExtValue();
27040 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27041 // Clear the NO_EXC bit and check remaining bits.
27042 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27043 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27044 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27045 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27046 RC == X86::STATIC_ROUNDING::TO_ZERO;
27047 }
27048 }
27049
27050 return false;
27051 };
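  // For reference, the STATIC_ROUNDING encoding these helpers assume (values
  // as defined in X86BaseInfo.h, listed here for convenience): TO_NEAREST_INT=0,
  // TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8. So an
  // immediate of 11 (NO_EXC | TO_ZERO) satisfies isRoundModeSAEToX with RC=3,
  // while 12 (NO_EXC | CUR_DIRECTION) satisfies isRoundModeSAE only.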
27052
27053 SDLoc dl(Op);
27054 unsigned IntNo = Op.getConstantOperandVal(0);
27055 MVT VT = Op.getSimpleValueType();
27056 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27057
27058 // Propagate flags from original node to transformed node(s).
27059 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27060
27061 if (IntrData) {
27062 switch(IntrData->Type) {
27063 case INTR_TYPE_1OP: {
27064 // We specify 2 possible opcodes for intrinsics with rounding modes.
27065 // First, we check if the intrinsic may have non-default rounding mode,
27066 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27067 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27068 if (IntrWithRoundingModeOpcode != 0) {
27069 SDValue Rnd = Op.getOperand(2);
27070 unsigned RC = 0;
27071 if (isRoundModeSAEToX(Rnd, RC))
27072 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27073 Op.getOperand(1),
27074 DAG.getTargetConstant(RC, dl, MVT::i32));
27075 if (!isRoundModeCurDirection(Rnd))
27076 return SDValue();
27077 }
27078 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27079 Op.getOperand(1));
27080 }
27081 case INTR_TYPE_1OP_SAE: {
27082 SDValue Sae = Op.getOperand(2);
27083
27084 unsigned Opc;
27085 if (isRoundModeCurDirection(Sae))
27086 Opc = IntrData->Opc0;
27087 else if (isRoundModeSAE(Sae))
27088 Opc = IntrData->Opc1;
27089 else
27090 return SDValue();
27091
27092 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27093 }
27094 case INTR_TYPE_2OP: {
27095 SDValue Src2 = Op.getOperand(2);
27096
27097 // We specify 2 possible opcodes for intrinsics with rounding modes.
27098 // First, we check if the intrinsic may have non-default rounding mode,
27099 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27100 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27101 if (IntrWithRoundingModeOpcode != 0) {
27102 SDValue Rnd = Op.getOperand(3);
27103 unsigned RC = 0;
27104 if (isRoundModeSAEToX(Rnd, RC))
27105 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27106 Op.getOperand(1), Src2,
27107 DAG.getTargetConstant(RC, dl, MVT::i32));
27108 if (!isRoundModeCurDirection(Rnd))
27109 return SDValue();
27110 }
27111
27112 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27113 Op.getOperand(1), Src2);
27114 }
27115 case INTR_TYPE_2OP_SAE: {
27116 SDValue Sae = Op.getOperand(3);
27117
27118 unsigned Opc;
27119 if (isRoundModeCurDirection(Sae))
27120 Opc = IntrData->Opc0;
27121 else if (isRoundModeSAE(Sae))
27122 Opc = IntrData->Opc1;
27123 else
27124 return SDValue();
27125
27126 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27127 Op.getOperand(2));
27128 }
27129 case INTR_TYPE_3OP:
27130 case INTR_TYPE_3OP_IMM8: {
27131 SDValue Src1 = Op.getOperand(1);
27132 SDValue Src2 = Op.getOperand(2);
27133 SDValue Src3 = Op.getOperand(3);
27134
27135 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27136 Src3.getValueType() != MVT::i8) {
27137 Src3 = DAG.getTargetConstant(
27138 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27139 }
27140
27141 // We specify 2 possible opcodes for intrinsics with rounding modes.
27142 // First, we check if the intrinsic may have non-default rounding mode,
27143 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27144 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27145 if (IntrWithRoundingModeOpcode != 0) {
27146 SDValue Rnd = Op.getOperand(4);
27147 unsigned RC = 0;
27148 if (isRoundModeSAEToX(Rnd, RC))
27149 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27150 Src1, Src2, Src3,
27151 DAG.getTargetConstant(RC, dl, MVT::i32));
27152 if (!isRoundModeCurDirection(Rnd))
27153 return SDValue();
27154 }
27155
27156 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27157 {Src1, Src2, Src3});
27158 }
27159 case INTR_TYPE_4OP_IMM8: {
27160 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27161 SDValue Src4 = Op.getOperand(4);
27162 if (Src4.getValueType() != MVT::i8) {
27163 Src4 = DAG.getTargetConstant(
27164 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27165 }
27166
27167 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27168 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27169 Src4);
27170 }
27171 case INTR_TYPE_1OP_MASK: {
27172 SDValue Src = Op.getOperand(1);
27173 SDValue PassThru = Op.getOperand(2);
27174 SDValue Mask = Op.getOperand(3);
27175 // We add the rounding mode to the node when
27176 // - an RC opcode is specified and
27177 // - RC is not "current direction".
27178 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27179 if (IntrWithRoundingModeOpcode != 0) {
27180 SDValue Rnd = Op.getOperand(4);
27181 unsigned RC = 0;
27182 if (isRoundModeSAEToX(Rnd, RC))
27183 return getVectorMaskingNode(
27184 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27185 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27186 Mask, PassThru, Subtarget, DAG);
27187 if (!isRoundModeCurDirection(Rnd))
27188 return SDValue();
27189 }
27190 return getVectorMaskingNode(
27191 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27192 Subtarget, DAG);
27193 }
27194 case INTR_TYPE_1OP_MASK_SAE: {
27195 SDValue Src = Op.getOperand(1);
27196 SDValue PassThru = Op.getOperand(2);
27197 SDValue Mask = Op.getOperand(3);
27198 SDValue Rnd = Op.getOperand(4);
27199
27200 unsigned Opc;
27201 if (isRoundModeCurDirection(Rnd))
27202 Opc = IntrData->Opc0;
27203 else if (isRoundModeSAE(Rnd))
27204 Opc = IntrData->Opc1;
27205 else
27206 return SDValue();
27207
27208 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27209 Subtarget, DAG);
27210 }
27211 case INTR_TYPE_SCALAR_MASK: {
27212 SDValue Src1 = Op.getOperand(1);
27213 SDValue Src2 = Op.getOperand(2);
27214 SDValue passThru = Op.getOperand(3);
27215 SDValue Mask = Op.getOperand(4);
27216 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27217 // There are 2 kinds of intrinsics in this group:
27218 // (1) With suppress-all-exceptions (SAE) or rounding mode - 6 operands
27219 // (2) With rounding mode and SAE - 7 operands.
27220 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27221 if (Op.getNumOperands() == (5U + HasRounding)) {
27222 if (HasRounding) {
27223 SDValue Rnd = Op.getOperand(5);
27224 unsigned RC = 0;
27225 if (isRoundModeSAEToX(Rnd, RC))
27226 return getScalarMaskingNode(
27227 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27228 DAG.getTargetConstant(RC, dl, MVT::i32)),
27229 Mask, passThru, Subtarget, DAG);
27230 if (!isRoundModeCurDirection(Rnd))
27231 return SDValue();
27232 }
27233 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27234 Src2),
27235 Mask, passThru, Subtarget, DAG);
27236 }
27237
27238 assert(Op.getNumOperands() == (6U + HasRounding) &&
27239 "Unexpected intrinsic form");
27240 SDValue RoundingMode = Op.getOperand(5);
27241 unsigned Opc = IntrData->Opc0;
27242 if (HasRounding) {
27243 SDValue Sae = Op.getOperand(6);
27244 if (isRoundModeSAE(Sae))
27245 Opc = IntrWithRoundingModeOpcode;
27246 else if (!isRoundModeCurDirection(Sae))
27247 return SDValue();
27248 }
27249 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27250 Src2, RoundingMode),
27251 Mask, passThru, Subtarget, DAG);
27252 }
27253 case INTR_TYPE_SCALAR_MASK_RND: {
27254 SDValue Src1 = Op.getOperand(1);
27255 SDValue Src2 = Op.getOperand(2);
27256 SDValue passThru = Op.getOperand(3);
27257 SDValue Mask = Op.getOperand(4);
27258 SDValue Rnd = Op.getOperand(5);
27259
27260 SDValue NewOp;
27261 unsigned RC = 0;
27262 if (isRoundModeCurDirection(Rnd))
27263 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27264 else if (isRoundModeSAEToX(Rnd, RC))
27265 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27266 DAG.getTargetConstant(RC, dl, MVT::i32));
27267 else
27268 return SDValue();
27269
27270 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27271 }
27272 case INTR_TYPE_SCALAR_MASK_SAE: {
27273 SDValue Src1 = Op.getOperand(1);
27274 SDValue Src2 = Op.getOperand(2);
27275 SDValue passThru = Op.getOperand(3);
27276 SDValue Mask = Op.getOperand(4);
27277 SDValue Sae = Op.getOperand(5);
27278 unsigned Opc;
27279 if (isRoundModeCurDirection(Sae))
27280 Opc = IntrData->Opc0;
27281 else if (isRoundModeSAE(Sae))
27282 Opc = IntrData->Opc1;
27283 else
27284 return SDValue();
27285
27286 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27287 Mask, passThru, Subtarget, DAG);
27288 }
27289 case INTR_TYPE_2OP_MASK: {
27290 SDValue Src1 = Op.getOperand(1);
27291 SDValue Src2 = Op.getOperand(2);
27292 SDValue PassThru = Op.getOperand(3);
27293 SDValue Mask = Op.getOperand(4);
27294 SDValue NewOp;
27295 if (IntrData->Opc1 != 0) {
27296 SDValue Rnd = Op.getOperand(5);
27297 unsigned RC = 0;
27298 if (isRoundModeSAEToX(Rnd, RC))
27299 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27300 DAG.getTargetConstant(RC, dl, MVT::i32));
27301 else if (!isRoundModeCurDirection(Rnd))
27302 return SDValue();
27303 }
27304 if (!NewOp)
27305 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27306 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27307 }
27308 case INTR_TYPE_2OP_MASK_SAE: {
27309 SDValue Src1 = Op.getOperand(1);
27310 SDValue Src2 = Op.getOperand(2);
27311 SDValue PassThru = Op.getOperand(3);
27312 SDValue Mask = Op.getOperand(4);
27313
27314 unsigned Opc = IntrData->Opc0;
27315 if (IntrData->Opc1 != 0) {
27316 SDValue Sae = Op.getOperand(5);
27317 if (isRoundModeSAE(Sae))
27318 Opc = IntrData->Opc1;
27319 else if (!isRoundModeCurDirection(Sae))
27320 return SDValue();
27321 }
27322
27323 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27324 Mask, PassThru, Subtarget, DAG);
27325 }
27326 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27327 SDValue Src1 = Op.getOperand(1);
27328 SDValue Src2 = Op.getOperand(2);
27329 SDValue Src3 = Op.getOperand(3);
27330 SDValue PassThru = Op.getOperand(4);
27331 SDValue Mask = Op.getOperand(5);
27332 SDValue Sae = Op.getOperand(6);
27333 unsigned Opc;
27334 if (isRoundModeCurDirection(Sae))
27335 Opc = IntrData->Opc0;
27336 else if (isRoundModeSAE(Sae))
27337 Opc = IntrData->Opc1;
27338 else
27339 return SDValue();
27340
27341 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27342 Mask, PassThru, Subtarget, DAG);
27343 }
27344 case INTR_TYPE_3OP_MASK_SAE: {
27345 SDValue Src1 = Op.getOperand(1);
27346 SDValue Src2 = Op.getOperand(2);
27347 SDValue Src3 = Op.getOperand(3);
27348 SDValue PassThru = Op.getOperand(4);
27349 SDValue Mask = Op.getOperand(5);
27350
27351 unsigned Opc = IntrData->Opc0;
27352 if (IntrData->Opc1 != 0) {
27353 SDValue Sae = Op.getOperand(6);
27354 if (isRoundModeSAE(Sae))
27355 Opc = IntrData->Opc1;
27356 else if (!isRoundModeCurDirection(Sae))
27357 return SDValue();
27358 }
27359 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27360 Mask, PassThru, Subtarget, DAG);
27361 }
27362 case BLENDV: {
27363 SDValue Src1 = Op.getOperand(1);
27364 SDValue Src2 = Op.getOperand(2);
27365 SDValue Src3 = Op.getOperand(3);
27366
27367 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27368 Src3 = DAG.getBitcast(MaskVT, Src3);
27369
27370 // Reverse the operands to match VSELECT order.
27371 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27372 }
27373 case VPERM_2OP : {
27374 SDValue Src1 = Op.getOperand(1);
27375 SDValue Src2 = Op.getOperand(2);
27376
27377 // Swap Src1 and Src2 in the node creation
27378 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
27379 }
27380 case CFMA_OP_MASKZ:
27381 case CFMA_OP_MASK: {
27382 SDValue Src1 = Op.getOperand(1);
27383 SDValue Src2 = Op.getOperand(2);
27384 SDValue Src3 = Op.getOperand(3);
27385 SDValue Mask = Op.getOperand(4);
27386 MVT VT = Op.getSimpleValueType();
27387
27388 SDValue PassThru = Src3;
27389 if (IntrData->Type == CFMA_OP_MASKZ)
27390 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27391
27392 // We add the rounding mode to the node when
27393 // - an RC opcode is specified and
27394 // - RC is not "current direction".
27395 SDValue NewOp;
27396 if (IntrData->Opc1 != 0) {
27397 SDValue Rnd = Op.getOperand(5);
27398 unsigned RC = 0;
27399 if (isRoundModeSAEToX(Rnd, RC))
27400 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27401 DAG.getTargetConstant(RC, dl, MVT::i32));
27402 else if (!isRoundModeCurDirection(Rnd))
27403 return SDValue();
27404 }
27405 if (!NewOp)
27406 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27407 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27408 }
27409 case IFMA_OP:
27410 // NOTE: We need to swizzle the operands to pass the multiply operands
27411 // first.
27412 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27413 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27414 case FPCLASSS: {
27415 SDValue Src1 = Op.getOperand(1);
27416 SDValue Imm = Op.getOperand(2);
27417 SDValue Mask = Op.getOperand(3);
27418 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27419 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27420 Subtarget, DAG);
27421 // Need to fill with zeros to ensure the bitcast will produce zeroes
27422 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27423 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27424 DAG.getConstant(0, dl, MVT::v8i1),
27425 FPclassMask, DAG.getIntPtrConstant(0, dl));
27426 return DAG.getBitcast(MVT::i8, Ins);
27427 }
27428
27429 case CMP_MASK_CC: {
27430 MVT MaskVT = Op.getSimpleValueType();
27431 SDValue CC = Op.getOperand(3);
27432 SDValue Mask = Op.getOperand(4);
27433 // We specify 2 possible opcodes for intrinsics with rounding modes.
27434 // First, we check if the intrinsic may have non-default rounding mode,
27435 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27436 if (IntrData->Opc1 != 0) {
27437 SDValue Sae = Op.getOperand(5);
27438 if (isRoundModeSAE(Sae))
27439 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
27440 Op.getOperand(2), CC, Mask, Sae);
27441 if (!isRoundModeCurDirection(Sae))
27442 return SDValue();
27443 }
27444 // Default rounding mode.
27445 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
27446 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
27447 }
27448 case CMP_MASK_SCALAR_CC: {
27449 SDValue Src1 = Op.getOperand(1);
27450 SDValue Src2 = Op.getOperand(2);
27451 SDValue CC = Op.getOperand(3);
27452 SDValue Mask = Op.getOperand(4);
27453
27454 SDValue Cmp;
27455 if (IntrData->Opc1 != 0) {
27456 SDValue Sae = Op.getOperand(5);
27457 if (isRoundModeSAE(Sae))
27458 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
27459 else if (!isRoundModeCurDirection(Sae))
27460 return SDValue();
27461 }
27462 // Default rounding mode.
27463 if (!Cmp.getNode())
27464 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
27465
27466 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
27467 Subtarget, DAG);
27468 // Need to fill with zeros to ensure the bitcast will produce zeroes
27469 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27470 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27471 DAG.getConstant(0, dl, MVT::v8i1),
27472 CmpMask, DAG.getIntPtrConstant(0, dl));
27473 return DAG.getBitcast(MVT::i8, Ins);
27474 }
27475 case COMI: { // Comparison intrinsics
27476 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
27477 SDValue LHS = Op.getOperand(1);
27478 SDValue RHS = Op.getOperand(2);
27479 // Some conditions require the operands to be swapped.
27480 if (CC == ISD::SETLT || CC == ISD::SETLE)
27481 std::swap(LHS, RHS);
27482
27483 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
27484 SDValue SetCC;
27485 switch (CC) {
27486 case ISD::SETEQ: { // (ZF = 1 and PF = 0)
27487 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
27488 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
27489 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
27490 break;
27491 }
27492 case ISD::SETNE: { // (ZF = 0 or PF = 1)
27493 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
27494 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
27495 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
27496 break;
27497 }
27498 case ISD::SETGT: // (CF = 0 and ZF = 0)
27499 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
27500 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
27501 break;
27502 }
27503 case ISD::SETGE: // CF = 0
27504 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
27505 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
27506 break;
27507 default:
27508 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27508)
;
27509 }
27510 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27511 }
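  // For reference, the flag pattern assumed above follows the (u)comis
  // behaviour documented in the SDM: unordered sets ZF=PF=CF=1, equal sets
  // only ZF, greater-than clears all three, and less-than sets only CF. The
  // extra parity checks in the EQ/NE cases exclude the unordered result.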
27512 case COMI_RM: { // Comparison intrinsics with Sae
27513 SDValue LHS = Op.getOperand(1);
27514 SDValue RHS = Op.getOperand(2);
27515 unsigned CondVal = Op.getConstantOperandVal(3);
27516 SDValue Sae = Op.getOperand(4);
27517
27518 SDValue FCmp;
27519 if (isRoundModeCurDirection(Sae))
27520 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
27521 DAG.getTargetConstant(CondVal, dl, MVT::i8));
27522 else if (isRoundModeSAE(Sae))
27523 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
27524 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
27525 else
27526 return SDValue();
27527 // Need to fill with zeros to ensure the bitcast will produce zeroes
27528 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27529 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
27530 DAG.getConstant(0, dl, MVT::v16i1),
27531 FCmp, DAG.getIntPtrConstant(0, dl));
27532 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
27533 DAG.getBitcast(MVT::i16, Ins));
27534 }
27535 case VSHIFT: {
27536 SDValue SrcOp = Op.getOperand(1);
27537 SDValue ShAmt = Op.getOperand(2);
27538 assert(ShAmt.getValueType() == MVT::i32 &&
27539 "Unexpected VSHIFT amount type");
27540
27541 // Catch shift-by-constant.
27542 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
27543 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
27544 Op.getSimpleValueType(), SrcOp,
27545 CShAmt->getZExtValue(), DAG);
27546
27547 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27548 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
27549 SrcOp, ShAmt, 0, Subtarget, DAG);
27550 }
27551 case COMPRESS_EXPAND_IN_REG: {
27552 SDValue Mask = Op.getOperand(3);
27553 SDValue DataToCompress = Op.getOperand(1);
27554 SDValue PassThru = Op.getOperand(2);
27555 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
27556 return Op.getOperand(1);
27557
27558 // Avoid false dependency.
27559 if (PassThru.isUndef())
27560 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27561
27562 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
27563 Mask);
27564 }
27565 case FIXUPIMM:
27566 case FIXUPIMM_MASKZ: {
27567 SDValue Src1 = Op.getOperand(1);
27568 SDValue Src2 = Op.getOperand(2);
27569 SDValue Src3 = Op.getOperand(3);
27570 SDValue Imm = Op.getOperand(4);
27571 SDValue Mask = Op.getOperand(5);
27572 SDValue Passthru = (IntrData->Type == FIXUPIMM)
27573 ? Src1
27574 : getZeroVector(VT, Subtarget, DAG, dl);
27575
27576 unsigned Opc = IntrData->Opc0;
27577 if (IntrData->Opc1 != 0) {
27578 SDValue Sae = Op.getOperand(6);
27579 if (isRoundModeSAE(Sae))
27580 Opc = IntrData->Opc1;
27581 else if (!isRoundModeCurDirection(Sae))
27582 return SDValue();
27583 }
27584
27585 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
27586
27587 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
27588 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
27589
27590 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
27591 }
27592 case ROUNDP: {
27593 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
27594 // Clear the upper bits of the rounding immediate so that the legacy
27595 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
27596 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
27597 SDValue RoundingMode =
27598 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
27599 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27600 Op.getOperand(1), RoundingMode);
27601 }
27602 case ROUNDS: {
27603 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
27604 // Clear the upper bits of the rounding immediate so that the legacy
27605 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
27606 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
27607 SDValue RoundingMode =
27608 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
27609 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27610 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27611 }
27612 case BEXTRI: {
27613 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27614
27615 uint64_t Imm = Op.getConstantOperandVal(2);
27616 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27617 Op.getValueType());
27618 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27619 Op.getOperand(1), Control);
27620 }
27621 // ADC/ADCX/SBB
27622 case ADX: {
27623 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27624 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27625
27626 SDValue Res;
27627 // If the carry in is zero, then we should just use ADD/SUB instead of
27628 // ADC/SBB.
27629 if (isNullConstant(Op.getOperand(1))) {
27630 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27631 Op.getOperand(3));
27632 } else {
27633 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27634 DAG.getConstant(-1, dl, MVT::i8));
27635 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27636 Op.getOperand(3), GenCF.getValue(1));
27637 }
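  // The GenCF node above rebuilds CF from the i8 carry-in: adding -1 (0xff)
  // to a non-zero byte always produces a carry-out, while adding it to zero
  // does not, so CF = (CarryIn != 0) before feeding the ADC/SBB.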
27638 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27639 SDValue Results[] = { SetCC, Res };
27640 return DAG.getMergeValues(Results, dl);
27641 }
27642 case CVTPD2PS_MASK:
27643 case CVTPD2DQ_MASK:
27644 case CVTQQ2PS_MASK:
27645 case TRUNCATE_TO_REG: {
27646 SDValue Src = Op.getOperand(1);
27647 SDValue PassThru = Op.getOperand(2);
27648 SDValue Mask = Op.getOperand(3);
27649
27650 if (isAllOnesConstant(Mask))
27651 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27652
27653 MVT SrcVT = Src.getSimpleValueType();
27654 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27655 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27656 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27657 {Src, PassThru, Mask});
27658 }
27659 case CVTPS2PH_MASK: {
27660 SDValue Src = Op.getOperand(1);
27661 SDValue Rnd = Op.getOperand(2);
27662 SDValue PassThru = Op.getOperand(3);
27663 SDValue Mask = Op.getOperand(4);
27664
27665 unsigned RC = 0;
27666 unsigned Opc = IntrData->Opc0;
27667 bool SAE = Src.getValueType().is512BitVector() &&
27668 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27669 if (SAE) {
27670 Opc = X86ISD::CVTPS2PH_SAE;
27671 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27672 }
27673
27674 if (isAllOnesConstant(Mask))
27675 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27676
27677 if (SAE)
27678 Opc = X86ISD::MCVTPS2PH_SAE;
27679 else
27680 Opc = IntrData->Opc1;
27681 MVT SrcVT = Src.getSimpleValueType();
27682 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27683 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27684 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27685 }
27686 case CVTNEPS2BF16_MASK: {
27687 SDValue Src = Op.getOperand(1);
27688 SDValue PassThru = Op.getOperand(2);
27689 SDValue Mask = Op.getOperand(3);
27690
27691 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27692 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27693
27694 // Break false dependency.
27695 if (PassThru.isUndef())
27696 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27697
27698 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27699 Mask);
27700 }
27701 default:
27702 break;
27703 }
27704 }
27705
27706 switch (IntNo) {
27707 default: return SDValue(); // Don't custom lower most intrinsics.
27708
27709 // ptest and testp intrinsics. The intrinsics these come from are designed to
27710 // return an integer value, not just an instruction, so lower them to the ptest
27711 // or testp pattern and a setcc for the result.
27712 case Intrinsic::x86_avx512_ktestc_b:
27713 case Intrinsic::x86_avx512_ktestc_w:
27714 case Intrinsic::x86_avx512_ktestc_d:
27715 case Intrinsic::x86_avx512_ktestc_q:
27716 case Intrinsic::x86_avx512_ktestz_b:
27717 case Intrinsic::x86_avx512_ktestz_w:
27718 case Intrinsic::x86_avx512_ktestz_d:
27719 case Intrinsic::x86_avx512_ktestz_q:
27720 case Intrinsic::x86_sse41_ptestz:
27721 case Intrinsic::x86_sse41_ptestc:
27722 case Intrinsic::x86_sse41_ptestnzc:
27723 case Intrinsic::x86_avx_ptestz_256:
27724 case Intrinsic::x86_avx_ptestc_256:
27725 case Intrinsic::x86_avx_ptestnzc_256:
27726 case Intrinsic::x86_avx_vtestz_ps:
27727 case Intrinsic::x86_avx_vtestc_ps:
27728 case Intrinsic::x86_avx_vtestnzc_ps:
27729 case Intrinsic::x86_avx_vtestz_pd:
27730 case Intrinsic::x86_avx_vtestc_pd:
27731 case Intrinsic::x86_avx_vtestnzc_pd:
27732 case Intrinsic::x86_avx_vtestz_ps_256:
27733 case Intrinsic::x86_avx_vtestc_ps_256:
27734 case Intrinsic::x86_avx_vtestnzc_ps_256:
27735 case Intrinsic::x86_avx_vtestz_pd_256:
27736 case Intrinsic::x86_avx_vtestc_pd_256:
27737 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27738 unsigned TestOpc = X86ISD::PTEST;
27739 X86::CondCode X86CC;
27740 switch (IntNo) {
27741 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27741)
;
27742 case Intrinsic::x86_avx512_ktestc_b:
27743 case Intrinsic::x86_avx512_ktestc_w:
27744 case Intrinsic::x86_avx512_ktestc_d:
27745 case Intrinsic::x86_avx512_ktestc_q:
27746 // CF = 1
27747 TestOpc = X86ISD::KTEST;
27748 X86CC = X86::COND_B;
27749 break;
27750 case Intrinsic::x86_avx512_ktestz_b:
27751 case Intrinsic::x86_avx512_ktestz_w:
27752 case Intrinsic::x86_avx512_ktestz_d:
27753 case Intrinsic::x86_avx512_ktestz_q:
27754 TestOpc = X86ISD::KTEST;
27755 X86CC = X86::COND_E;
27756 break;
27757 case Intrinsic::x86_avx_vtestz_ps:
27758 case Intrinsic::x86_avx_vtestz_pd:
27759 case Intrinsic::x86_avx_vtestz_ps_256:
27760 case Intrinsic::x86_avx_vtestz_pd_256:
27761 TestOpc = X86ISD::TESTP;
27762 [[fallthrough]];
27763 case Intrinsic::x86_sse41_ptestz:
27764 case Intrinsic::x86_avx_ptestz_256:
27765 // ZF = 1
27766 X86CC = X86::COND_E;
27767 break;
27768 case Intrinsic::x86_avx_vtestc_ps:
27769 case Intrinsic::x86_avx_vtestc_pd:
27770 case Intrinsic::x86_avx_vtestc_ps_256:
27771 case Intrinsic::x86_avx_vtestc_pd_256:
27772 TestOpc = X86ISD::TESTP;
27773 [[fallthrough]];
27774 case Intrinsic::x86_sse41_ptestc:
27775 case Intrinsic::x86_avx_ptestc_256:
27776 // CF = 1
27777 X86CC = X86::COND_B;
27778 break;
27779 case Intrinsic::x86_avx_vtestnzc_ps:
27780 case Intrinsic::x86_avx_vtestnzc_pd:
27781 case Intrinsic::x86_avx_vtestnzc_ps_256:
27782 case Intrinsic::x86_avx_vtestnzc_pd_256:
27783 TestOpc = X86ISD::TESTP;
27784 [[fallthrough]];
27785 case Intrinsic::x86_sse41_ptestnzc:
27786 case Intrinsic::x86_avx_ptestnzc_256:
27787 // ZF and CF = 0
27788 X86CC = X86::COND_A;
27789 break;
27790 }
27791
27792 SDValue LHS = Op.getOperand(1);
27793 SDValue RHS = Op.getOperand(2);
27794 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27795 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27796 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27797 }
27798
27799 case Intrinsic::x86_sse42_pcmpistria128:
27800 case Intrinsic::x86_sse42_pcmpestria128:
27801 case Intrinsic::x86_sse42_pcmpistric128:
27802 case Intrinsic::x86_sse42_pcmpestric128:
27803 case Intrinsic::x86_sse42_pcmpistrio128:
27804 case Intrinsic::x86_sse42_pcmpestrio128:
27805 case Intrinsic::x86_sse42_pcmpistris128:
27806 case Intrinsic::x86_sse42_pcmpestris128:
27807 case Intrinsic::x86_sse42_pcmpistriz128:
27808 case Intrinsic::x86_sse42_pcmpestriz128: {
27809 unsigned Opcode;
27810 X86::CondCode X86CC;
27811 switch (IntNo) {
27812 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27812)
; // Can't reach here.
27813 case Intrinsic::x86_sse42_pcmpistria128:
27814 Opcode = X86ISD::PCMPISTR;
27815 X86CC = X86::COND_A;
27816 break;
27817 case Intrinsic::x86_sse42_pcmpestria128:
27818 Opcode = X86ISD::PCMPESTR;
27819 X86CC = X86::COND_A;
27820 break;
27821 case Intrinsic::x86_sse42_pcmpistric128:
27822 Opcode = X86ISD::PCMPISTR;
27823 X86CC = X86::COND_B;
27824 break;
27825 case Intrinsic::x86_sse42_pcmpestric128:
27826 Opcode = X86ISD::PCMPESTR;
27827 X86CC = X86::COND_B;
27828 break;
27829 case Intrinsic::x86_sse42_pcmpistrio128:
27830 Opcode = X86ISD::PCMPISTR;
27831 X86CC = X86::COND_O;
27832 break;
27833 case Intrinsic::x86_sse42_pcmpestrio128:
27834 Opcode = X86ISD::PCMPESTR;
27835 X86CC = X86::COND_O;
27836 break;
27837 case Intrinsic::x86_sse42_pcmpistris128:
27838 Opcode = X86ISD::PCMPISTR;
27839 X86CC = X86::COND_S;
27840 break;
27841 case Intrinsic::x86_sse42_pcmpestris128:
27842 Opcode = X86ISD::PCMPESTR;
27843 X86CC = X86::COND_S;
27844 break;
27845 case Intrinsic::x86_sse42_pcmpistriz128:
27846 Opcode = X86ISD::PCMPISTR;
27847 X86CC = X86::COND_E;
27848 break;
27849 case Intrinsic::x86_sse42_pcmpestriz128:
27850 Opcode = X86ISD::PCMPESTR;
27851 X86CC = X86::COND_E;
27852 break;
27853 }
27854 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27855 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27856 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27857 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27858 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27859 }
27860
27861 case Intrinsic::x86_sse42_pcmpistri128:
27862 case Intrinsic::x86_sse42_pcmpestri128: {
27863 unsigned Opcode;
27864 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27865 Opcode = X86ISD::PCMPISTR;
27866 else
27867 Opcode = X86ISD::PCMPESTR;
27868
27869 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27870 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27871 return DAG.getNode(Opcode, dl, VTs, NewOps);
27872 }
27873
27874 case Intrinsic::x86_sse42_pcmpistrm128:
27875 case Intrinsic::x86_sse42_pcmpestrm128: {
27876 unsigned Opcode;
27877 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27878 Opcode = X86ISD::PCMPISTR;
27879 else
27880 Opcode = X86ISD::PCMPESTR;
27881
27882 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27883 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27884 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27885 }
27886
27887 case Intrinsic::eh_sjlj_lsda: {
27888 MachineFunction &MF = DAG.getMachineFunction();
27889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27890 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27891 auto &Context = MF.getMMI().getContext();
27892 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27893 Twine(MF.getFunctionNumber()));
27894 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
27895 DAG.getMCSymbol(S, PtrVT));
27896 }
27897
27898 case Intrinsic::x86_seh_lsda: {
27899 // Compute the symbol for the LSDA. We know it'll get emitted later.
27900 MachineFunction &MF = DAG.getMachineFunction();
27901 SDValue Op1 = Op.getOperand(1);
27902 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27903 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
27904 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27905
27906 // Generate a simple absolute symbol reference. This intrinsic is only
27907 // supported on 32-bit Windows, which isn't PIC.
27908 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27909 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27910 }
27911
27912 case Intrinsic::eh_recoverfp: {
27913 SDValue FnOp = Op.getOperand(1);
27914 SDValue IncomingFPOp = Op.getOperand(2);
27915 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27916 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27917 if (!Fn)
27918 report_fatal_error(
27919 "llvm.eh.recoverfp must take a function as the first argument");
27920 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27921 }
27922
27923 case Intrinsic::localaddress: {
27924 // Returns one of the stack, base, or frame pointer registers, depending on
27925 // which is used to reference local variables.
27926 MachineFunction &MF = DAG.getMachineFunction();
27927 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27928 unsigned Reg;
27929 if (RegInfo->hasBasePointer(MF))
27930 Reg = RegInfo->getBaseRegister();
27931 else { // Handles the SP or FP case.
27932 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27933 if (CantUseFP)
27934 Reg = RegInfo->getPtrSizedStackRegister(MF);
27935 else
27936 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27937 }
27938 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27939 }
27940 case Intrinsic::x86_avx512_vp2intersect_q_512:
27941 case Intrinsic::x86_avx512_vp2intersect_q_256:
27942 case Intrinsic::x86_avx512_vp2intersect_q_128:
27943 case Intrinsic::x86_avx512_vp2intersect_d_512:
27944 case Intrinsic::x86_avx512_vp2intersect_d_256:
27945 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27946 MVT MaskVT = Op.getSimpleValueType();
27947
27948 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27949 SDLoc DL(Op);
27950
27951 SDValue Operation =
27952 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27953 Op->getOperand(1), Op->getOperand(2));
27954
27955 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
27956 MaskVT, Operation);
27957 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
27958 MaskVT, Operation);
27959 return DAG.getMergeValues({Result0, Result1}, DL);
27960 }
27961 case Intrinsic::x86_mmx_pslli_w:
27962 case Intrinsic::x86_mmx_pslli_d:
27963 case Intrinsic::x86_mmx_pslli_q:
27964 case Intrinsic::x86_mmx_psrli_w:
27965 case Intrinsic::x86_mmx_psrli_d:
27966 case Intrinsic::x86_mmx_psrli_q:
27967 case Intrinsic::x86_mmx_psrai_w:
27968 case Intrinsic::x86_mmx_psrai_d: {
27969 SDLoc DL(Op);
27970 SDValue ShAmt = Op.getOperand(2);
27971 // If the argument is a constant, convert it to a target constant.
27972 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27973 // Clamp out-of-bounds shift amounts, since they would otherwise be masked
27974 // to 8 bits, which could bring them back into bounds.
27975 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27976 if (ShiftAmount == 0)
27977 return Op.getOperand(1);
27978
27979 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27980 Op.getOperand(0), Op.getOperand(1),
27981 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27982 }
27983
27984 unsigned NewIntrinsic;
27985 switch (IntNo) {
27986 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27986)
; // Can't reach here.
27987 case Intrinsic::x86_mmx_pslli_w:
27988 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27989 break;
27990 case Intrinsic::x86_mmx_pslli_d:
27991 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27992 break;
27993 case Intrinsic::x86_mmx_pslli_q:
27994 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27995 break;
27996 case Intrinsic::x86_mmx_psrli_w:
27997 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27998 break;
27999 case Intrinsic::x86_mmx_psrli_d:
28000 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28001 break;
28002 case Intrinsic::x86_mmx_psrli_q:
28003 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28004 break;
28005 case Intrinsic::x86_mmx_psrai_w:
28006 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28007 break;
28008 case Intrinsic::x86_mmx_psrai_d:
28009 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28010 break;
28011 }
28012
28013 // The vector shift intrinsics with a scalar amount use a 32-bit shift amount,
28014 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
28015 // MMX register.
28016 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28017 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28018 DAG.getTargetConstant(NewIntrinsic, DL,
28019 getPointerTy(DAG.getDataLayout())),
28020 Op.getOperand(1), ShAmt);
28021 }
28022 case Intrinsic::thread_pointer: {
28023 if (Subtarget.isTargetELF()) {
28024 SDLoc dl(Op);
28025 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28026 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28027 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28028 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28029 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28030 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28031 }
28032 report_fatal_error(
28033 "Target OS doesn't support __builtin_thread_pointer() yet.");
28034 }
28035 }
28036}
28037
28038static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28039 SDValue Src, SDValue Mask, SDValue Base,
28040 SDValue Index, SDValue ScaleOp, SDValue Chain,
28041 const X86Subtarget &Subtarget) {
28042 SDLoc dl(Op);
28043 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28044 // Scale must be constant.
28045 if (!C)
28046 return SDValue();
28047 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28048 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28049 TLI.getPointerTy(DAG.getDataLayout()));
28050 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28051 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28052 // If source is undef or we know it won't be used, use a zero vector
28053 // to break register dependency.
28054 // TODO: use undef instead and let BreakFalseDeps deal with it?
28055 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28056 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28057
28058 // Cast mask to an integer type.
28059 Mask = DAG.getBitcast(MaskVT, Mask);
28060
28061 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28062
28063 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28064 SDValue Res =
28065 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28066 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28067 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28068}
28069
28070static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28071 SDValue Src, SDValue Mask, SDValue Base,
28072 SDValue Index, SDValue ScaleOp, SDValue Chain,
28073 const X86Subtarget &Subtarget) {
28074 MVT VT = Op.getSimpleValueType();
28075 SDLoc dl(Op);
28076 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28077 // Scale must be constant.
28078 if (!C)
28079 return SDValue();
28080 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28081 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28082 TLI.getPointerTy(DAG.getDataLayout()));
28083 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28084 VT.getVectorNumElements());
28085 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28086
28087 // We support two versions of the gather intrinsics. One with scalar mask and
28088 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28089 if (Mask.getValueType() != MaskVT)
28090 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28091
28092 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28093 // If source is undef or we know it won't be used, use a zero vector
28094 // to break register dependency.
28095 // TODO: use undef instead and let BreakFalseDeps deal with it?
28096 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28097 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28098
28099 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28100
28101 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28102 SDValue Res =
28103 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28104 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28105 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28106}
28107
28108static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28109 SDValue Src, SDValue Mask, SDValue Base,
28110 SDValue Index, SDValue ScaleOp, SDValue Chain,
28111 const X86Subtarget &Subtarget) {
28112 SDLoc dl(Op);
28113 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28114 // Scale must be constant.
28115 if (!C)
28116 return SDValue();
28117 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28118 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28119 TLI.getPointerTy(DAG.getDataLayout()));
28120 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28121 Src.getSimpleValueType().getVectorNumElements());
28122 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28123
28124 // We support two versions of the scatter intrinsics. One with scalar mask and
28125 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28126 if (Mask.getValueType() != MaskVT)
28127 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28128
28129 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28130
28131 SDVTList VTs = DAG.getVTList(MVT::Other);
28132 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28133 SDValue Res =
28134 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28135 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28136 return Res;
28137}
28138
28139static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28140 SDValue Mask, SDValue Base, SDValue Index,
28141 SDValue ScaleOp, SDValue Chain,
28142 const X86Subtarget &Subtarget) {
28143 SDLoc dl(Op);
28144 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28145 // Scale must be constant.
28146 if (!C)
28147 return SDValue();
28148 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28149 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28150 TLI.getPointerTy(DAG.getDataLayout()));
28151 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28152 SDValue Segment = DAG.getRegister(0, MVT::i32);
28153 MVT MaskVT =
28154 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28155 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28156 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28157 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28158 return SDValue(Res, 0);
28159}
28160
28161/// Handles the lowering of builtin intrinsics with chain that return their
28162/// value into registers EDX:EAX.
28163 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28164 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28165 /// TargetOpcode.
28166 /// Returns a Glue value which can be used to add an extra copy-from-reg if the
28167 /// expanded intrinsic implicitly defines extra registers (i.e. not just
28168 /// EDX:EAX).
28169static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28170 SelectionDAG &DAG,
28171 unsigned TargetOpcode,
28172 unsigned SrcReg,
28173 const X86Subtarget &Subtarget,
28174 SmallVectorImpl<SDValue> &Results) {
28175 SDValue Chain = N->getOperand(0);
28176 SDValue Glue;
28177
28178 if (SrcReg) {
28179 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28180 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28181 Glue = Chain.getValue(1);
28182 }
28183
28184 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28185 SDValue N1Ops[] = {Chain, Glue};
28186 SDNode *N1 = DAG.getMachineNode(
28187 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28188 Chain = SDValue(N1, 0);
28189
28190 // Reads the content of XCR and returns it in registers EDX:EAX.
28191 SDValue LO, HI;
28192 if (Subtarget.is64Bit()) {
28193 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28194 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28195 LO.getValue(2));
28196 } else {
28197 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28198 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28199 LO.getValue(2));
28200 }
28201 Chain = HI.getValue(1);
28202 Glue = HI.getValue(2);
28203
28204 if (Subtarget.is64Bit()) {
28205 // Merge the two 32-bit values into a 64-bit one.
28206 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28207 DAG.getConstant(32, DL, MVT::i8));
28208 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28209 Results.push_back(Chain);
28210 return Glue;
28211 }
28212
28213 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28214 SDValue Ops[] = { LO, HI };
28215 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28216 Results.push_back(Pair);
28217 Results.push_back(Chain);
28218 return Glue;
28219}
28220
28221/// Handles the lowering of builtin intrinsics that read the time stamp counter
28222/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28223/// READCYCLECOUNTER nodes.
28224static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28225 SelectionDAG &DAG,
28226 const X86Subtarget &Subtarget,
28227 SmallVectorImpl<SDValue> &Results) {
28228 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28229 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28230 // and the EAX register is loaded with the low-order 32 bits.
28231 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28232 /* NoRegister */0, Subtarget,
28233 Results);
28234 if (Opcode != X86::RDTSCP)
28235 return;
28236
28237 SDValue Chain = Results[1];
28238 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
28239 // the ECX register. Add 'ecx' explicitly to the chain.
28240 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28241 Results[1] = ecx;
28242 Results.push_back(ecx.getValue(1));
28243}
28244
28245static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28246 SelectionDAG &DAG) {
28247 SmallVector<SDValue, 3> Results;
28248 SDLoc DL(Op);
28249 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28250 Results);
28251 return DAG.getMergeValues(Results, DL);
28252}
28253
28254static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28255 MachineFunction &MF = DAG.getMachineFunction();
28256 SDValue Chain = Op.getOperand(0);
28257 SDValue RegNode = Op.getOperand(2);
28258 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28259 if (!EHInfo)
28260 report_fatal_error("EH registrations only live in functions using WinEH");
28261
28262 // Cast the operand to an alloca, and remember the frame index.
28263 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28264 if (!FINode)
28265 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28266 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28267
28268 // Return the chain operand without making any DAG nodes.
28269 return Chain;
28270}
28271
28272static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28273 MachineFunction &MF = DAG.getMachineFunction();
28274 SDValue Chain = Op.getOperand(0);
28275 SDValue EHGuard = Op.getOperand(2);
28276 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28277 if (!EHInfo)
28278 report_fatal_error("EHGuard only live in functions using WinEH");
28279
28280 // Cast the operand to an alloca, and remember the frame index.
28281 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28282 if (!FINode)
28283 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28284 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28285
28286 // Return the chain operand without making any DAG nodes.
28287 return Chain;
28288}
28289
28290/// Emit Truncating Store with signed or unsigned saturation.
28291static SDValue
28292EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28293 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28294 SelectionDAG &DAG) {
28295 SDVTList VTs = DAG.getVTList(MVT::Other);
28296 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28297 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28298 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28299 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28300}
28301
28302/// Emit Masked Truncating Store with signed or unsigned saturation.
28303static SDValue
28304EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28305 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28306 MachineMemOperand *MMO, SelectionDAG &DAG) {
28307 SDVTList VTs = DAG.getVTList(MVT::Other);
28308 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28309 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28310 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28311}
28312
28313static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28314 SelectionDAG &DAG) {
28315 unsigned IntNo = Op.getConstantOperandVal(1);
28316 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28317 if (!IntrData) {
28318 switch (IntNo) {
28319
28320 case Intrinsic::swift_async_context_addr: {
28321 SDLoc dl(Op);
28322 auto &MF = DAG.getMachineFunction();
28323 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28324 if (Subtarget.is64Bit()) {
28325 MF.getFrameInfo().setFrameAddressIsTaken(true);
28326 X86FI->setHasSwiftAsyncContext(true);
28327 SDValue Chain = Op->getOperand(0);
28328 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28329 SDValue Result =
28330 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28331 DAG.getTargetConstant(8, dl, MVT::i32)),
28332 0);
28333 // Return { result, chain }.
28334 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28335 CopyRBP.getValue(1));
28336 } else {
28337 // 32-bit so no special extended frame, create or reuse an existing
28338 // stack slot.
28339 if (!X86FI->getSwiftAsyncContextFrameIdx())
28340 X86FI->setSwiftAsyncContextFrameIdx(
28341 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28342 SDValue Result =
28343 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28344 // Return { result, chain }.
28345 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28346 Op->getOperand(0));
28347 }
28348 }
28349
28350 case llvm::Intrinsic::x86_seh_ehregnode:
28351 return MarkEHRegistrationNode(Op, DAG);
28352 case llvm::Intrinsic::x86_seh_ehguard:
28353 return MarkEHGuard(Op, DAG);
28354 case llvm::Intrinsic::x86_rdpkru: {
28355 SDLoc dl(Op);
28356 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28357 // Create a RDPKRU node and pass 0 to the ECX parameter.
28358 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28359 DAG.getConstant(0, dl, MVT::i32));
28360 }
28361 case llvm::Intrinsic::x86_wrpkru: {
28362 SDLoc dl(Op);
28363 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28364 // to the EDX and ECX parameters.
28365 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28366 Op.getOperand(0), Op.getOperand(2),
28367 DAG.getConstant(0, dl, MVT::i32),
28368 DAG.getConstant(0, dl, MVT::i32));
28369 }
28370 case llvm::Intrinsic::asan_check_memaccess: {
28371 // Mark this as adjustsStack because it will be lowered to a call.
28372 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28373 // Don't do anything here, we will expand these intrinsics out later.
28374 return Op;
28375 }
28376 case llvm::Intrinsic::x86_flags_read_u32:
28377 case llvm::Intrinsic::x86_flags_read_u64:
28378 case llvm::Intrinsic::x86_flags_write_u32:
28379 case llvm::Intrinsic::x86_flags_write_u64: {
28380 // We need a frame pointer because this will get lowered to a PUSH/POP
28381 // sequence.
28382 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28383 MFI.setHasCopyImplyingStackAdjustment(true);
28384 // Don't do anything here, we will expand these intrinsics out later
28385 // during FinalizeISel in EmitInstrWithCustomInserter.
28386 return Op;
28387 }
28388 case Intrinsic::x86_lwpins32:
28389 case Intrinsic::x86_lwpins64:
28390 case Intrinsic::x86_umwait:
28391 case Intrinsic::x86_tpause: {
28392 SDLoc dl(Op);
28393 SDValue Chain = Op->getOperand(0);
28394 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28395 unsigned Opcode;
28396
28397 switch (IntNo) {
28398 default: llvm_unreachable("Impossible intrinsic");
28399 case Intrinsic::x86_umwait:
28400 Opcode = X86ISD::UMWAIT;
28401 break;
28402 case Intrinsic::x86_tpause:
28403 Opcode = X86ISD::TPAUSE;
28404 break;
28405 case Intrinsic::x86_lwpins32:
28406 case Intrinsic::x86_lwpins64:
28407 Opcode = X86ISD::LWPINS;
28408 break;
28409 }
28410
28411 SDValue Operation =
28412 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28413 Op->getOperand(3), Op->getOperand(4));
28414 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28415 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28416 Operation.getValue(1));
28417 }
28418 case Intrinsic::x86_enqcmd:
28419 case Intrinsic::x86_enqcmds: {
28420 SDLoc dl(Op);
28421 SDValue Chain = Op.getOperand(0);
28422 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28423 unsigned Opcode;
28424 switch (IntNo) {
28425 default: llvm_unreachable("Impossible intrinsic!");
28426 case Intrinsic::x86_enqcmd:
28427 Opcode = X86ISD::ENQCMD;
28428 break;
28429 case Intrinsic::x86_enqcmds:
28430 Opcode = X86ISD::ENQCMDS;
28431 break;
28432 }
28433 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
28434 Op.getOperand(3));
28435 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
28436 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28437 Operation.getValue(1));
28438 }
28439 case Intrinsic::x86_aesenc128kl:
28440 case Intrinsic::x86_aesdec128kl:
28441 case Intrinsic::x86_aesenc256kl:
28442 case Intrinsic::x86_aesdec256kl: {
28443 SDLoc DL(Op);
28444 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
28445 SDValue Chain = Op.getOperand(0);
28446 unsigned Opcode;
28447
28448 switch (IntNo) {
28449 default: llvm_unreachable("Impossible intrinsic");
28450 case Intrinsic::x86_aesenc128kl:
28451 Opcode = X86ISD::AESENC128KL;
28452 break;
28453 case Intrinsic::x86_aesdec128kl:
28454 Opcode = X86ISD::AESDEC128KL;
28455 break;
28456 case Intrinsic::x86_aesenc256kl:
28457 Opcode = X86ISD::AESENC256KL;
28458 break;
28459 case Intrinsic::x86_aesdec256kl:
28460 Opcode = X86ISD::AESDEC256KL;
28461 break;
28462 }
28463
28464 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28465 MachineMemOperand *MMO = MemIntr->getMemOperand();
28466 EVT MemVT = MemIntr->getMemoryVT();
28467 SDValue Operation = DAG.getMemIntrinsicNode(
28468 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
28469 MMO);
28470 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
28471
28472 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28473 {ZF, Operation.getValue(0), Operation.getValue(2)});
28474 }
28475 case Intrinsic::x86_aesencwide128kl:
28476 case Intrinsic::x86_aesdecwide128kl:
28477 case Intrinsic::x86_aesencwide256kl:
28478 case Intrinsic::x86_aesdecwide256kl: {
28479 SDLoc DL(Op);
28480 SDVTList VTs = DAG.getVTList(
28481 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
28482 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
28483 SDValue Chain = Op.getOperand(0);
28484 unsigned Opcode;
28485
28486 switch (IntNo) {
28487 default: llvm_unreachable("Impossible intrinsic");
28488 case Intrinsic::x86_aesencwide128kl:
28489 Opcode = X86ISD::AESENCWIDE128KL;
28490 break;
28491 case Intrinsic::x86_aesdecwide128kl:
28492 Opcode = X86ISD::AESDECWIDE128KL;
28493 break;
28494 case Intrinsic::x86_aesencwide256kl:
28495 Opcode = X86ISD::AESENCWIDE256KL;
28496 break;
28497 case Intrinsic::x86_aesdecwide256kl:
28498 Opcode = X86ISD::AESDECWIDE256KL;
28499 break;
28500 }
28501
28502 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28503 MachineMemOperand *MMO = MemIntr->getMemOperand();
28504 EVT MemVT = MemIntr->getMemoryVT();
28505 SDValue Operation = DAG.getMemIntrinsicNode(
28506 Opcode, DL, VTs,
28507 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
28508 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
28509 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
28510 MemVT, MMO);
28511 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
28512
28513 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28514 {ZF, Operation.getValue(1), Operation.getValue(2),
28515 Operation.getValue(3), Operation.getValue(4),
28516 Operation.getValue(5), Operation.getValue(6),
28517 Operation.getValue(7), Operation.getValue(8),
28518 Operation.getValue(9)});
28519 }
28520 case Intrinsic::x86_testui: {
28521 SDLoc dl(Op);
28522 SDValue Chain = Op.getOperand(0);
28523 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28524 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
28525 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28526 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28527 Operation.getValue(1));
28528 }
28529 case Intrinsic::x86_atomic_bts_rm:
28530 case Intrinsic::x86_atomic_btc_rm:
28531 case Intrinsic::x86_atomic_btr_rm: {
28532 SDLoc DL(Op);
28533 MVT VT = Op.getSimpleValueType();
28534 SDValue Chain = Op.getOperand(0);
28535 SDValue Op1 = Op.getOperand(2);
28536 SDValue Op2 = Op.getOperand(3);
28537 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28538 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28539 : X86ISD::LBTR_RM;
28540 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28541 SDValue Res =
28542 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28543 {Chain, Op1, Op2}, VT, MMO);
28544 Chain = Res.getValue(1);
28545 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28546 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28547 }
28548 case Intrinsic::x86_atomic_bts:
28549 case Intrinsic::x86_atomic_btc:
28550 case Intrinsic::x86_atomic_btr: {
28551 SDLoc DL(Op);
28552 MVT VT = Op.getSimpleValueType();
28553 SDValue Chain = Op.getOperand(0);
28554 SDValue Op1 = Op.getOperand(2);
28555 SDValue Op2 = Op.getOperand(3);
28556 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28557 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28558 : X86ISD::LBTR;
28559 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28560 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28561 SDValue Res =
28562 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28563 {Chain, Op1, Op2, Size}, VT, MMO);
28564 Chain = Res.getValue(1);
28565 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28566 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
28567 if (Imm)
28568 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28569 DAG.getShiftAmountConstant(Imm, VT, DL));
28570 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28571 }
28572 case Intrinsic::x86_cmpccxadd32:
28573 case Intrinsic::x86_cmpccxadd64: {
28574 SDLoc DL(Op);
28575 SDValue Chain = Op.getOperand(0);
28576 SDValue Addr = Op.getOperand(2);
28577 SDValue Src1 = Op.getOperand(3);
28578 SDValue Src2 = Op.getOperand(4);
28579 SDValue CC = Op.getOperand(5);
28580 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28581 SDValue Operation = DAG.getMemIntrinsicNode(
28582 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28583 MVT::i32, MMO);
28584 return Operation;
28585 }
28586 case Intrinsic::x86_aadd32:
28587 case Intrinsic::x86_aadd64:
28588 case Intrinsic::x86_aand32:
28589 case Intrinsic::x86_aand64:
28590 case Intrinsic::x86_aor32:
28591 case Intrinsic::x86_aor64:
28592 case Intrinsic::x86_axor32:
28593 case Intrinsic::x86_axor64: {
28594 SDLoc DL(Op);
28595 SDValue Chain = Op.getOperand(0);
28596 SDValue Op1 = Op.getOperand(2);
28597 SDValue Op2 = Op.getOperand(3);
28598 MVT VT = Op2.getSimpleValueType();
28599 unsigned Opc = 0;
28600 switch (IntNo) {
28601 default:
28602 llvm_unreachable("Unknown Intrinsic");
28603 case Intrinsic::x86_aadd32:
28604 case Intrinsic::x86_aadd64:
28605 Opc = X86ISD::AADD;
28606 break;
28607 case Intrinsic::x86_aand32:
28608 case Intrinsic::x86_aand64:
28609 Opc = X86ISD::AAND;
28610 break;
28611 case Intrinsic::x86_aor32:
28612 case Intrinsic::x86_aor64:
28613 Opc = X86ISD::AOR;
28614 break;
28615 case Intrinsic::x86_axor32:
28616 case Intrinsic::x86_axor64:
28617 Opc = X86ISD::AXOR;
28618 break;
28619 }
28620 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28621 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28622 {Chain, Op1, Op2}, VT, MMO);
28623 }
28624 case Intrinsic::x86_atomic_add_cc:
28625 case Intrinsic::x86_atomic_sub_cc:
28626 case Intrinsic::x86_atomic_or_cc:
28627 case Intrinsic::x86_atomic_and_cc:
28628 case Intrinsic::x86_atomic_xor_cc: {
28629 SDLoc DL(Op);
28630 SDValue Chain = Op.getOperand(0);
28631 SDValue Op1 = Op.getOperand(2);
28632 SDValue Op2 = Op.getOperand(3);
28633 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28634 MVT VT = Op2.getSimpleValueType();
28635 unsigned Opc = 0;
28636 switch (IntNo) {
28637 default:
28638 llvm_unreachable("Unknown Intrinsic");
28639 case Intrinsic::x86_atomic_add_cc:
28640 Opc = X86ISD::LADD;
28641 break;
28642 case Intrinsic::x86_atomic_sub_cc:
28643 Opc = X86ISD::LSUB;
28644 break;
28645 case Intrinsic::x86_atomic_or_cc:
28646 Opc = X86ISD::LOR;
28647 break;
28648 case Intrinsic::x86_atomic_and_cc:
28649 Opc = X86ISD::LAND;
28650 break;
28651 case Intrinsic::x86_atomic_xor_cc:
28652 Opc = X86ISD::LXOR;
28653 break;
28654 }
28655 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28656 SDValue LockArith =
28657 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28658 {Chain, Op1, Op2}, VT, MMO);
28659 Chain = LockArith.getValue(1);
28660 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28661 }
28662 }
28663 return SDValue();
28664 }
28665
28666 SDLoc dl(Op);
28667 switch(IntrData->Type) {
28668 default: llvm_unreachable("Unknown Intrinsic Type");
28669 case RDSEED:
28670 case RDRAND: {
28671 // Emit the node with the right value type.
28672 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28673 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28674
28675 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28676 // Otherwise return the value from Rand, which is always 0, casted to i32.
28677 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28678 DAG.getConstant(1, dl, Op->getValueType(1)),
28679 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28680 SDValue(Result.getNode(), 1)};
28681 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28682
28683 // Return { result, isValid, chain }.
28684 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28685 SDValue(Result.getNode(), 2));
28686 }
28687 case GATHER_AVX2: {
28688 SDValue Chain = Op.getOperand(0);
28689 SDValue Src = Op.getOperand(2);
28690 SDValue Base = Op.getOperand(3);
28691 SDValue Index = Op.getOperand(4);
28692 SDValue Mask = Op.getOperand(5);
28693 SDValue Scale = Op.getOperand(6);
28694 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28695 Scale, Chain, Subtarget);
28696 }
28697 case GATHER: {
28698 //gather(v1, mask, index, base, scale);
28699 SDValue Chain = Op.getOperand(0);
28700 SDValue Src = Op.getOperand(2);
28701 SDValue Base = Op.getOperand(3);
28702 SDValue Index = Op.getOperand(4);
28703 SDValue Mask = Op.getOperand(5);
28704 SDValue Scale = Op.getOperand(6);
28705 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28706 Chain, Subtarget);
28707 }
28708 case SCATTER: {
28709 //scatter(base, mask, index, v1, scale);
28710 SDValue Chain = Op.getOperand(0);
28711 SDValue Base = Op.getOperand(2);
28712 SDValue Mask = Op.getOperand(3);
28713 SDValue Index = Op.getOperand(4);
28714 SDValue Src = Op.getOperand(5);
28715 SDValue Scale = Op.getOperand(6);
28716 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28717 Scale, Chain, Subtarget);
28718 }
28719 case PREFETCH: {
28720 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28721 assert((HintVal == 2 || HintVal == 3) &&
28722        "Wrong prefetch hint in intrinsic: should be 2 or 3");
28723 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28724 SDValue Chain = Op.getOperand(0);
28725 SDValue Mask = Op.getOperand(2);
28726 SDValue Index = Op.getOperand(3);
28727 SDValue Base = Op.getOperand(4);
28728 SDValue Scale = Op.getOperand(5);
28729 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28730 Subtarget);
28731 }
28732 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28733 case RDTSC: {
28734 SmallVector<SDValue, 2> Results;
28735 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28736 Results);
28737 return DAG.getMergeValues(Results, dl);
28738 }
28739 // Read Performance Monitoring Counters.
28740 case RDPMC:
28741 // Read Processor Register.
28742 case RDPRU:
28743 // GetExtended Control Register.
28744 case XGETBV: {
28745 SmallVector<SDValue, 2> Results;
28746
28747 // RDPMC uses ECX to select the index of the performance counter to read.
28748 // RDPRU uses ECX to select the processor register to read.
28749 // XGETBV uses ECX to select the index of the XCR register to return.
28750 // The result is stored into registers EDX:EAX.
28751 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28752 Subtarget, Results);
28753 return DAG.getMergeValues(Results, dl);
28754 }
28755 // XTEST intrinsics.
28756 case XTEST: {
28757 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28758 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28759
28760 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28761 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28762 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28763 Ret, SDValue(InTrans.getNode(), 1));
28764 }
28765 case TRUNCATE_TO_MEM_VI8:
28766 case TRUNCATE_TO_MEM_VI16:
28767 case TRUNCATE_TO_MEM_VI32: {
28768 SDValue Mask = Op.getOperand(4);
28769 SDValue DataToTruncate = Op.getOperand(3);
28770 SDValue Addr = Op.getOperand(2);
28771 SDValue Chain = Op.getOperand(0);
28772
28773 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28774 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28775
28776 EVT MemVT = MemIntr->getMemoryVT();
28777
28778 uint16_t TruncationOp = IntrData->Opc0;
28779 switch (TruncationOp) {
28780 case X86ISD::VTRUNC: {
28781 if (isAllOnesConstant(Mask)) // return just a truncate store
28782 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28783 MemIntr->getMemOperand());
28784
28785 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28786 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28787 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28788
28789 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28790 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28791 true /* truncating */);
28792 }
28793 case X86ISD::VTRUNCUS:
28794 case X86ISD::VTRUNCS: {
28795 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28796 if (isAllOnesConstant(Mask))
28797 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28798 MemIntr->getMemOperand(), DAG);
28799
28800 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28801 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28802
28803 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28804 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28805 }
28806 default:
28807 llvm_unreachable("Unsupported truncstore intrinsic");
28808 }
28809 }
28810 }
28811}
28812
28813SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28814 SelectionDAG &DAG) const {
28815 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28816 MFI.setReturnAddressIsTaken(true);
28817
28818 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
28819 return SDValue();
28820
28821 unsigned Depth = Op.getConstantOperandVal(0);
28822 SDLoc dl(Op);
28823 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28824
28825 if (Depth > 0) {
28826 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28827 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28828 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28829 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28830 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28831 MachinePointerInfo());
28832 }
28833
28834 // Just load the return address.
28835 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28836 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28837 MachinePointerInfo());
28838}
28839
28840SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28841 SelectionDAG &DAG) const {
28842 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28843 return getReturnAddressFrameIndex(DAG);
28844}
28845
28846SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28847 MachineFunction &MF = DAG.getMachineFunction();
28848 MachineFrameInfo &MFI = MF.getFrameInfo();
28849 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28850 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28851 EVT VT = Op.getValueType();
28852
28853 MFI.setFrameAddressIsTaken(true);
28854
28855 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28856 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28857 // is not possible to crawl up the stack without looking at the unwind codes
28858 // simultaneously.
28859 int FrameAddrIndex = FuncInfo->getFAIndex();
28860 if (!FrameAddrIndex) {
28861 // Set up a frame object for the return address.
28862 unsigned SlotSize = RegInfo->getSlotSize();
28863 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28864 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28865 FuncInfo->setFAIndex(FrameAddrIndex);
28866 }
28867 return DAG.getFrameIndex(FrameAddrIndex, VT);
28868 }
28869
28870 unsigned FrameReg =
28871 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28872 SDLoc dl(Op); // FIXME probably not meaningful
28873 unsigned Depth = Op.getConstantOperandVal(0);
28874 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28875         (FrameReg == X86::EBP && VT == MVT::i32)) &&
28876        "Invalid Frame Register!");
28877 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28878 while (Depth--)
28879 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28880 MachinePointerInfo());
28881 return FrameAddr;
28882}
28883
28884// FIXME? Maybe this could be a TableGen attribute on some registers and
28885// this table could be generated automatically from RegInfo.
28886Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28887 const MachineFunction &MF) const {
28888 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28889
28890 Register Reg = StringSwitch<unsigned>(RegName)
28891 .Case("esp", X86::ESP)
28892 .Case("rsp", X86::RSP)
28893 .Case("ebp", X86::EBP)
28894 .Case("rbp", X86::RBP)
28895 .Default(0);
28896
28897 if (Reg == X86::EBP || Reg == X86::RBP) {
28898 if (!TFI.hasFP(MF))
28899 report_fatal_error("register " + StringRef(RegName) +
28900 " is allocatable: function has no frame pointer");
28901#ifndef NDEBUG
28902 else {
28903 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28904 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28905 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28906        "Invalid Frame Register!");
28907 }
28908#endif
28909 }
28910
28911 if (Reg)
28912 return Reg;
28913
28914 report_fatal_error("Invalid register name global variable");
28915}
28916
28917SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28918 SelectionDAG &DAG) const {
28919 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28920 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28921}
28922
28923Register X86TargetLowering::getExceptionPointerRegister(
28924 const Constant *PersonalityFn) const {
28925 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28926 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28927
28928 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28929}
28930
28931Register X86TargetLowering::getExceptionSelectorRegister(
28932 const Constant *PersonalityFn) const {
28933 // Funclet personalities don't use selectors (the runtime does the selection).
28934 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28935 return X86::NoRegister;
28936 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28937}
28938
28939bool X86TargetLowering::needsFixedCatchObjects() const {
28940 return Subtarget.isTargetWin64();
28941}
28942
28943SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28944 SDValue Chain = Op.getOperand(0);
28945 SDValue Offset = Op.getOperand(1);
28946 SDValue Handler = Op.getOperand(2);
28947 SDLoc dl (Op);
28948
28949 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28950 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28951 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28952 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28953         (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28954        "Invalid Frame Register!");
28955 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28956 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28957
28958 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28959 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28960 dl));
28961 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28962 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28963 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28964
28965 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28966 DAG.getRegister(StoreAddrReg, PtrVT));
28967}
28968
28969SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28970 SelectionDAG &DAG) const {
28971 SDLoc DL(Op);
28972 // If the subtarget is not 64bit, we may need the global base reg
28973 // after isel expand pseudo, i.e., after CGBR pass ran.
28974 // Therefore, ask for the GlobalBaseReg now, so that the pass
28975 // inserts the code for us in case we need it.
28976 // Otherwise, we will end up in a situation where we will
28977 // reference a virtual register that is not defined!
28978 if (!Subtarget.is64Bit()) {
28979 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28980 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28981 }
28982 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28983 DAG.getVTList(MVT::i32, MVT::Other),
28984 Op.getOperand(0), Op.getOperand(1));
28985}
28986
28987SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28988 SelectionDAG &DAG) const {
28989 SDLoc DL(Op);
28990 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28991 Op.getOperand(0), Op.getOperand(1));
28992}
28993
28994SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28995 SelectionDAG &DAG) const {
28996 SDLoc DL(Op);
28997 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28998 Op.getOperand(0));
28999}
29000
29001static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29002 return Op.getOperand(0);
29003}
29004
29005SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29006 SelectionDAG &DAG) const {
29007 SDValue Root = Op.getOperand(0);
29008 SDValue Trmp = Op.getOperand(1); // trampoline
29009 SDValue FPtr = Op.getOperand(2); // nested function
29010 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29011 SDLoc dl (Op);
29012
29013 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29014 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29015
29016 if (Subtarget.is64Bit()) {
29017 SDValue OutChains[6];
29018
29019 // Large code-model.
29020 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29021 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29022
29023 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29024 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29025
29026 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
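      // Taken together, the stores below materialize a trampoline of the form
      // (offsets in bytes; immediates stored little-endian):
      //    0: 49 BB <8-byte FPtr>    movabsq $FPtr, %r11
      //   10: 49 BA <8-byte Nest>    movabsq $Nest, %r10
      //   20: 49 FF E3               jmpq   *%r11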
29027
29028 // Load the pointer to the nested function into R11.
29029 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29030 SDValue Addr = Trmp;
29031 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29032 Addr, MachinePointerInfo(TrmpAddr));
29033
29034 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29035 DAG.getConstant(2, dl, MVT::i64));
29036 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29037 MachinePointerInfo(TrmpAddr, 2), Align(2));
29038
29039 // Load the 'nest' parameter value into R10.
29040 // R10 is specified in X86CallingConv.td
29041 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29042 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29043 DAG.getConstant(10, dl, MVT::i64));
29044 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29045 Addr, MachinePointerInfo(TrmpAddr, 10));
29046
29047 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29048 DAG.getConstant(12, dl, MVT::i64));
29049 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29050 MachinePointerInfo(TrmpAddr, 12), Align(2));
29051
29052 // Jump to the nested function.
29053 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29054 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29055 DAG.getConstant(20, dl, MVT::i64));
29056 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29057 Addr, MachinePointerInfo(TrmpAddr, 20));
29058
29059 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29060 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29061 DAG.getConstant(22, dl, MVT::i64));
29062 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29063 Addr, MachinePointerInfo(TrmpAddr, 22));
29064
29065 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29066 } else {
29067 const Function *Func =
29068 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29069 CallingConv::ID CC = Func->getCallingConv();
29070 unsigned NestReg;
29071
29072 switch (CC) {
29073 default:
29074 llvm_unreachable("Unsupported calling convention");
29075 case CallingConv::C:
29076 case CallingConv::X86_StdCall: {
29077 // Pass 'nest' parameter in ECX.
29078 // Must be kept in sync with X86CallingConv.td
29079 NestReg = X86::ECX;
29080
29081 // Check that ECX wasn't needed by an 'inreg' parameter.
29082 FunctionType *FTy = Func->getFunctionType();
29083 const AttributeList &Attrs = Func->getAttributes();
29084
29085 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29086 unsigned InRegCount = 0;
29087 unsigned Idx = 0;
29088
29089 for (FunctionType::param_iterator I = FTy->param_begin(),
29090 E = FTy->param_end(); I != E; ++I, ++Idx)
29091 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29092 const DataLayout &DL = DAG.getDataLayout();
29093 // FIXME: should only count parameters that are lowered to integers.
29094 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29095 }
29096
29097 if (InRegCount > 2) {
29098 report_fatal_error("Nest register in use - reduce number of inreg"
29099 " parameters!");
29100 }
29101 }
29102 break;
29103 }
29104 case CallingConv::X86_FastCall:
29105 case CallingConv::X86_ThisCall:
29106 case CallingConv::Fast:
29107 case CallingConv::Tail:
29108 case CallingConv::SwiftTail:
29109 // Pass 'nest' parameter in EAX.
29110 // Must be kept in sync with X86CallingConv.td
29111 NestReg = X86::EAX;
29112 break;
29113 }
29114
29115 SDValue OutChains[4];
29116 SDValue Addr, Disp;
29117
29118 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29119 DAG.getConstant(10, dl, MVT::i32));
29120 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29121
29122 // This is storing the opcode for MOV32ri.
29123 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29124 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29125 OutChains[0] =
29126 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29127 Trmp, MachinePointerInfo(TrmpAddr));
29128
29129 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29130 DAG.getConstant(1, dl, MVT::i32));
29131 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29132 MachinePointerInfo(TrmpAddr, 1), Align(1));
29133
29134 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29135 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29136 DAG.getConstant(5, dl, MVT::i32));
29137 OutChains[2] =
29138 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29139 MachinePointerInfo(TrmpAddr, 5), Align(1));
29140
29141 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29142 DAG.getConstant(6, dl, MVT::i32));
29143 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29144 MachinePointerInfo(TrmpAddr, 6), Align(1));
29145
29146 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29147 }
29148}
29149
29150SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29151 SelectionDAG &DAG) const {
29152 /*
29153 The rounding mode is in bits 11:10 of FPSR, and has the following
29154 settings:
29155 00 Round to nearest
29156 01 Round to -inf
29157 10 Round to +inf
29158 11 Round to 0
29159
29160 GET_ROUNDING, on the other hand, expects the following:
29161 -1 Undefined
29162 0 Round to 0
29163 1 Round to nearest
29164 2 Round to +inf
29165 3 Round to -inf
29166
29167 To perform the conversion, we use a packed lookup table of the four 2-bit
29168 values that we can index by FPSR[11:10]
29169 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29170
29171 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
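    For example, FPSR[11:10] = 10 (round to +inf) gives FPSR & 0xc00 = 0x800,
    0x800 >> 9 = 4, and (0x2d >> 4) & 3 = 2, which GET_ROUNDING defines as
    round to +inf.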
29172 */
29173
29174 MachineFunction &MF = DAG.getMachineFunction();
29175 MVT VT = Op.getSimpleValueType();
29176 SDLoc DL(Op);
29177
29178 // Save FP Control Word to stack slot
29179 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29180 SDValue StackSlot =
29181 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29182
29183 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29184
29185 SDValue Chain = Op.getOperand(0);
29186 SDValue Ops[] = {Chain, StackSlot};
29187 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29188 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29189 Align(2), MachineMemOperand::MOStore);
29190
29191 // Load FP Control Word from stack slot
29192 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29193 Chain = CWD.getValue(1);
29194
29195 // Mask and turn the control bits into a shift for the lookup table.
29196 SDValue Shift =
29197 DAG.getNode(ISD::SRL, DL, MVT::i16,
29198 DAG.getNode(ISD::AND, DL, MVT::i16,
29199 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29200 DAG.getConstant(9, DL, MVT::i8));
29201 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29202
29203 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29204 SDValue RetVal =
29205 DAG.getNode(ISD::AND, DL, MVT::i32,
29206 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29207 DAG.getConstant(3, DL, MVT::i32));
29208
29209 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29210
29211 return DAG.getMergeValues({RetVal, Chain}, DL);
29212}
29213
29214SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29215 SelectionDAG &DAG) const {
29216 MachineFunction &MF = DAG.getMachineFunction();
29217 SDLoc DL(Op);
29218 SDValue Chain = Op.getNode()->getOperand(0);
29219
29220 // FP control word may be set only from data in memory. So we need to allocate
29221 // stack space to save/load FP control word.
29222 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29223 SDValue StackSlot =
29224 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29225 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29226 MachineMemOperand *MMO =
29227 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29228
29229 // Store FP control word into memory.
29230 SDValue Ops[] = {Chain, StackSlot};
29231 Chain = DAG.getMemIntrinsicNode(
29232 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29233
29234 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29235 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29236 Chain = CWD.getValue(1);
29237 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29238 DAG.getConstant(0xf3ff, DL, MVT::i16));
29239
29240 // Calculate new rounding mode.
29241 SDValue NewRM = Op.getNode()->getOperand(1);
29242 SDValue RMBits;
29243 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29244 uint64_t RM = CVal->getZExtValue();
29245 int FieldVal;
29246 switch (static_cast<RoundingMode>(RM)) {
29247 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29248 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29249 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29250 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29251 default:
29252 llvm_unreachable("rounding mode is not supported by X86 hardware");
29253 }
29254 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29255 } else {
29256 // Need to convert argument into bits of control word:
29257 // 0 Round to 0 -> 11
29258 // 1 Round to nearest -> 00
29259 // 2 Round to +inf -> 10
29260 // 3 Round to -inf -> 01
29261 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
29262 // To make the conversion, put all these values into a value 0xc9 and shift
29263 // it left depending on the rounding mode:
29264 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29265 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29266 // ...
29267 // (0xc9 << (2 * NewRM + 4)) & 0xc00
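    // For example, NewRM = 2 (round to +inf) yields a shift of 2 * 2 + 4 = 8,
    // and (0xc9 << 8) & 0xc00 = 0x800, i.e. RM bits 11:10 = 10 as expected.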
29268 SDValue ShiftValue =
29269 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29270 DAG.getNode(ISD::ADD, DL, MVT::i32,
29271 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29272 DAG.getConstant(1, DL, MVT::i8)),
29273 DAG.getConstant(4, DL, MVT::i32)));
29274 SDValue Shifted =
29275 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29276 ShiftValue);
29277 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29278 DAG.getConstant(0xc00, DL, MVT::i16));
29279 }
29280
29281 // Update rounding mode bits and store the new FP Control Word into stack.
29282 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29283 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29284
29285 // Load FP control word from the slot.
29286 SDValue OpsLD[] = {Chain, StackSlot};
29287 MachineMemOperand *MMOL =
29288 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29289 Chain = DAG.getMemIntrinsicNode(
29290 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29291
29292 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29293 // same way but in bits 14:13.
29294 if (Subtarget.hasSSE1()) {
29295 // Store MXCSR into memory.
29296 Chain = DAG.getNode(
29297 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29298 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29299 StackSlot);
29300
29301 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29302 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29303 Chain = CWD.getValue(1);
29304 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29305 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29306
29307 // Shift X87 RM bits from 11:10 to 14:13.
29308 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29309 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29310 DAG.getConstant(3, DL, MVT::i8));
29311
29312 // Update rounding mode bits and store the new FP Control Word into stack.
29313 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29314 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29315
29316 // Load MXCSR from the slot.
29317 Chain = DAG.getNode(
29318 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29319 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29320 StackSlot);
29321 }
29322
29323 return Chain;
29324}
29325
29326/// Lower a vector CTLZ using native supported vector CTLZ instruction.
29327//
29328// i8/i16 vector implemented using dword LZCNT vector instruction
29329// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29330 // split the vector, perform the operation on its Lo and Hi parts, and
29331// concatenate the results.
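// For an i8 element this computes, for example, CTLZ(0x10) as
// CTLZ_i32(zext32(0x10)) - 24 = 27 - 24 = 3.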
29332static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29333 const X86Subtarget &Subtarget) {
29334 assert(Op.getOpcode() == ISD::CTLZ);
29335 SDLoc dl(Op);
29336 MVT VT = Op.getSimpleValueType();
29337 MVT EltVT = VT.getVectorElementType();
29338 unsigned NumElems = VT.getVectorNumElements();
29339
29340 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29341        "Unsupported element type");
29342
29343 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29344 if (NumElems > 16 ||
29345 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29346 return splitVectorIntUnary(Op, DAG);
29347
29348 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29349 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29350        "Unsupported value type for operation");
29351
29352 // Use the natively supported vector instruction vplzcntd.
29353 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29354 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29355 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29356 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29357
29358 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29359}
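// Illustrative worked example (not part of the original LLVM source): for an
// i8 element x = 0x10, zext32(x) = 0x00000010 has ctlz32 = 27, and Delta =
// 32 - 8 = 24, giving 27 - 24 = 3 = ctlz8(0x10).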
29360
29361// Lower CTLZ using a PSHUFB lookup table implementation.
29362static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29363 const X86Subtarget &Subtarget,
29364 SelectionDAG &DAG) {
29365 MVT VT = Op.getSimpleValueType();
29366 int NumElts = VT.getVectorNumElements();
29367 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29368 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29369
29370 // Per-nibble leading zero PSHUFB lookup table.
29371 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29372 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29373 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29374 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
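// Illustrative note (not part of the original LLVM source): each entry is the
// leading-zero count of its 4-bit index, e.g. nibble 0x2 = 0b0010 has two
// leading zeros (LUT[2] == 2) and nibble 0x0 has all four bits clear
// (LUT[0] == 4).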
29375
29376 SmallVector<SDValue, 64> LUTVec;
29377 for (int i = 0; i < NumBytes; ++i)
29378 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29379 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29380
29381 // Begin by bitcasting the input to a byte vector, then split those bytes
29382 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29383 // If the hi input nibble is zero then we add both results together, otherwise
29384 // we just take the hi result (by masking the lo result to zero before the
29385 // add).
29386 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29387 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29388
29389 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29390 SDValue Lo = Op0;
29391 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29392 SDValue HiZ;
29393 if (CurrVT.is512BitVector()) {
29394 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29395 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29396 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29397 } else {
29398 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29399 }
29400
29401 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29402 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29403 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29404 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
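// Illustrative worked example (not part of the original LLVM source): for the
// byte 0x05, the hi nibble 0x0 maps to 4 and the lo nibble 0x5 maps to 1; the
// hi nibble is zero, so 4 + 1 = 5 = ctlz8(0x05). For 0x35 the hi nibble is
// non-zero, so the lo result is masked to zero and the answer is the hi
// result 2 = ctlz8(0x35).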
29405
29406 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29407 // of the current vector width in the same way we did for the nibbles.
29408 // If the upper half of the input element is zero then add the halves'
29409 // leading zero counts together, otherwise just use the upper half's.
29410 // Double the width of the result until we are at target width.
29411 while (CurrVT != VT) {
29412 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29413 int CurrNumElts = CurrVT.getVectorNumElements();
29414 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29415 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29416 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29417
29418 // Check if the upper half of the input element is zero.
29419 if (CurrVT.is512BitVector()) {
29420 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29421 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29422 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29423 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29424 } else {
29425 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29426 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29427 }
29428 HiZ = DAG.getBitcast(NextVT, HiZ);
29429
29430 // Move the upper/lower halves to the lower bits as we'll be extending to
29431 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29432 // together.
29433 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29434 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29435 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29436 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29437 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29438 CurrVT = NextVT;
29439 }
29440
29441 return Res;
29442}
29443
29444static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29445 const X86Subtarget &Subtarget,
29446 SelectionDAG &DAG) {
29447 MVT VT = Op.getSimpleValueType();
29448
29449 if (Subtarget.hasCDI() &&
29450 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29451 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29452 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29453
29454 // Decompose 256-bit ops into smaller 128-bit ops.
29455 if (VT.is256BitVector() && !Subtarget.hasInt256())
29456 return splitVectorIntUnary(Op, DAG);
29457
29458 // Decompose 512-bit ops into smaller 256-bit ops.
29459 if (VT.is512BitVector() && !Subtarget.hasBWI())
29460 return splitVectorIntUnary(Op, DAG);
29461
29462 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29463 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29464}
29465
29466static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29467 SelectionDAG &DAG) {
29468 MVT VT = Op.getSimpleValueType();
29469 MVT OpVT = VT;
29470 unsigned NumBits = VT.getSizeInBits();
29471 SDLoc dl(Op);
29472 unsigned Opc = Op.getOpcode();
29473
29474 if (VT.isVector())
29475 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29476
29477 Op = Op.getOperand(0);
29478 if (VT == MVT::i8) {
29479 // Zero extend to i32 since there is not an i8 bsr.
29480 OpVT = MVT::i32;
29481 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29482 }
29483
29484 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29485 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29486 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
29487
29488 if (Opc == ISD::CTLZ) {
29489 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29490 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29491 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29492 Op.getValue(1)};
29493 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29494 }
29495
29496 // Finally xor with NumBits-1.
29497 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29498 DAG.getConstant(NumBits - 1, dl, OpVT));
29499
29500 if (VT == MVT::i8)
29501 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29502 return Op;
29503}
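// Illustrative worked example (not part of the original LLVM source): for i32
// x = 0x00010000, BSR returns 16 and 16 ^ 31 = 15 = ctlz32(x). For x == 0 the
// CMOV substitutes 2*NumBits - 1 = 63, and 63 ^ 31 = 32 = NumBits, the result
// ISD::CTLZ is defined to give for a zero input.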
29504
29505static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29506 SelectionDAG &DAG) {
29507 MVT VT = Op.getSimpleValueType();
29508 unsigned NumBits = VT.getScalarSizeInBits();
29509 SDValue N0 = Op.getOperand(0);
29510 SDLoc dl(Op);
29511
29512 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29513 "Only scalar CTTZ requires custom lowering");
29514
29515 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29516 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29517 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
29518
29519 // If src is known never zero we can skip the CMOV.
29520 if (DAG.isKnownNeverZero(N0))
29521 return Op;
29522
29523 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29524 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29525 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29526 Op.getValue(1)};
29527 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29528}
29529
29530static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29531 const X86Subtarget &Subtarget) {
29532 MVT VT = Op.getSimpleValueType();
29533 if (VT == MVT::i16 || VT == MVT::i32)
29534 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
29535
29536 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29537 return splitVectorIntBinary(Op, DAG);
29538
29539 assert(Op.getSimpleValueType().is256BitVector() &&
29540 Op.getSimpleValueType().isInteger() &&
29541 "Only handle AVX 256-bit vector integer operation");
29542 return splitVectorIntBinary(Op, DAG);
29543}
29544
29545static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29546 const X86Subtarget &Subtarget) {
29547 MVT VT = Op.getSimpleValueType();
29548 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29549 unsigned Opcode = Op.getOpcode();
29550 SDLoc DL(Op);
29551
29552 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29553 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29554 assert(Op.getSimpleValueType().isInteger() &&
29555 "Only handle AVX vector integer operation");
29556 return splitVectorIntBinary(Op, DAG);
29557 }
29558
29559 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29560 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29561 EVT SetCCResultType =
29562 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29563
29564 unsigned BitWidth = VT.getScalarSizeInBits();
29565 if (Opcode == ISD::USUBSAT) {
29566 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29567 // Handle a special-case with a bit-hack instead of cmp+select:
29568 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29569 // If the target can use VPTERNLOG, DAGToDAG will match this as
29570 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29571 // "broadcast" constant load.
29572 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29573 if (C && C->getAPIntValue().isSignMask()) {
29574 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29575 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29576 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29577 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29578 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29579 }
29580 }
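// Illustrative worked example (not part of the original LLVM source) for the
// sign-mask bit-hack above with i8 Y = SMIN = 0x80: X = 0x90 gives
// (0x90 ^ 0x80) = 0x10 and (0x90 s>> 7) = 0xFF, so the AND yields
// 0x10 = 0x90 - 0x80; X = 0x10 gives (0x10 s>> 7) = 0x00, so the result
// correctly saturates to 0.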
29581 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29582 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29583 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29584 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29585 // TODO: Move this to DAGCombiner?
29586 if (SetCCResultType == VT &&
29587 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29588 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29589 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29590 }
29591 }
29592
29593 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29594 (!VT.isVector() || VT == MVT::v2i64)) {
29595 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29596 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29597 SDValue Zero = DAG.getConstant(0, DL, VT);
29598 SDValue Result =
29599 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29600 DAG.getVTList(VT, SetCCResultType), X, Y);
29601 SDValue SumDiff = Result.getValue(0);
29602 SDValue Overflow = Result.getValue(1);
29603 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29604 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29605 SDValue SumNeg =
29606 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29607 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29608 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29609 }
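// Illustrative worked example (not part of the original LLVM source): for i8
// saddsat(100, 50), SADDO wraps to -106 and reports overflow; the wrapped sum
// is negative, so SatMax = 127 is chosen. For saddsat(-100, -50) the sum wraps
// to +106, so SatMin = -128 is chosen. Without overflow, SumDiff itself is
// returned.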
29610
29611 // Use default expansion.
29612 return SDValue();
29613}
29614
29615static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29616 SelectionDAG &DAG) {
29617 MVT VT = Op.getSimpleValueType();
29618 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29619 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29620 // 8-bit integer abs to NEG and CMOV.
29621 SDLoc DL(Op);
29622 SDValue N0 = Op.getOperand(0);
29623 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29624 DAG.getConstant(0, DL, VT), N0);
29625 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29626 SDValue(Neg.getNode(), 1)};
29627 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29628 }
29629
29630 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29631 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29632 SDLoc DL(Op);
29633 SDValue Src = Op.getOperand(0);
29634 SDValue Sub =
29635 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
29636 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
29637 }
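// Illustrative note (not part of the original LLVM source): the blend selects
// per lane on the sign of Src, so negative lanes take 0 - Src and
// non-negative lanes keep Src, e.g. a lane holding -5 becomes 5.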
29638
29639 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29640 assert(VT.isInteger() &&
29641 "Only handle AVX 256-bit vector integer operation");
29642 return splitVectorIntUnary(Op, DAG);
29643 }
29644
29645 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29646 return splitVectorIntUnary(Op, DAG);
29647
29648 // Default to expand.
29649 return SDValue();
29650}
29651
29652static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29653 SelectionDAG &DAG) {
29654 MVT VT = Op.getSimpleValueType();
29655
29656 // For AVX1 cases, split to use legal ops.
29657 if (VT.is256BitVector() && !Subtarget.hasInt256())
29658 return splitVectorIntBinary(Op, DAG);
29659
29660 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29661 return splitVectorIntBinary(Op, DAG);
29662
29663 // Default to expand.
29664 return SDValue();
29665}
29666
29667static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29668 SelectionDAG &DAG) {
29669 MVT VT = Op.getSimpleValueType();
29670
29671 // For AVX1 cases, split to use legal ops.
29672 if (VT.is256BitVector() && !Subtarget.hasInt256())
29673 return splitVectorIntBinary(Op, DAG);
29674
29675 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29676 return splitVectorIntBinary(Op, DAG);
29677
29678 // Default to expand.
29679 return SDValue();
29680}
29681
29682static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29683 SelectionDAG &DAG) {
29684 MVT VT = Op.getSimpleValueType();
29685
29686 // For AVX1 cases, split to use legal ops.
29687 if (VT.is256BitVector() && !Subtarget.hasInt256())
29688 return splitVectorIntBinary(Op, DAG);
29689
29690 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29691 return splitVectorIntBinary(Op, DAG);
29692
29693 // TODO: Add TargetLowering expandABD() support.
29694 SDLoc dl(Op);
29695 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29696 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29697 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29698 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29699
29700 // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
29701 // abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
29702 unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
29703 unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
29704 if (TLI.isOperationLegal(MaxOpc, VT) && TLI.isOperationLegal(MinOpc, VT)) {
29705 SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
29706 SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
29707 return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
29708 }
29709
29710 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
29711 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
29712 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29713 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
29714 SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
29715 return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
29716 DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
29717}
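// Illustrative worked example (not part of the original LLVM source): for i8
// lanes, abds(3, -7) = smax - smin = 3 - (-7) = 10 = |3 - (-7)|, while
// abdu(0x03, 0xF9) treats the lanes as unsigned and gives 249 - 3 = 246.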
29718
29719static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29720 SelectionDAG &DAG) {
29721 SDLoc dl(Op);
29722 MVT VT = Op.getSimpleValueType();
29723
29724 // Decompose 256-bit ops into 128-bit ops.
29725 if (VT.is256BitVector() && !Subtarget.hasInt256())
29726 return splitVectorIntBinary(Op, DAG);
29727
29728 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29729 return splitVectorIntBinary(Op, DAG);
29730
29731 SDValue A = Op.getOperand(0);
29732 SDValue B = Op.getOperand(1);
29733
29734 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29735 // vector pairs, multiply and truncate.
29736 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29737 unsigned NumElts = VT.getVectorNumElements();
29738
29739 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29740 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29741 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29742 return DAG.getNode(
29743 ISD::TRUNCATE, dl, VT,
29744 DAG.getNode(ISD::MUL, dl, ExVT,
29745 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29746 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29747 }
29748
29749 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29750
29751 // Extract the lo/hi parts and any-extend them to i16.
29752 // We're going to mask off the low byte of each result element of the
29753 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29754 // element.
29755 SDValue Undef = DAG.getUNDEF(VT);
29756 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29757 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29758
29759 SDValue BLo, BHi;
29760 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29761 // If the RHS is a constant, manually unpackl/unpackh.
29762 SmallVector<SDValue, 16> LoOps, HiOps;
29763 for (unsigned i = 0; i != NumElts; i += 16) {
29764 for (unsigned j = 0; j != 8; ++j) {
29765 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29766 MVT::i16));
29767 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29768 MVT::i16));
29769 }
29770 }
29771
29772 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29773 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29774 } else {
29775 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29776 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29777 }
29778
29779 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
29780 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29781 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29782 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29783 }
29784
29785 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29786 if (VT == MVT::v4i32) {
29787 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29788 "Should not custom lower when pmulld is available!");
29789
29790 // Extract the odd parts.
29791 static const int UnpackMask[] = { 1, -1, 3, -1 };
29792 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29793 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29794
29795 // Multiply the even parts.
29796 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29797 DAG.getBitcast(MVT::v2i64, A),
29798 DAG.getBitcast(MVT::v2i64, B));
29799 // Now multiply odd parts.
29800 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29801 DAG.getBitcast(MVT::v2i64, Aodds),
29802 DAG.getBitcast(MVT::v2i64, Bodds));
29803
29804 Evens = DAG.getBitcast(VT, Evens);
29805 Odds = DAG.getBitcast(VT, Odds);
29806
29807 // Merge the two vectors back together with a shuffle. This expands into 2
29808 // shuffles.
29809 static const int ShufMask[] = { 0, 4, 2, 6 };
29810 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29811 }
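// Illustrative worked example (not part of the original LLVM source): with
// A = <a|b|c|d> and B = <e|f|g|h>, Evens bitcast to v4i32 is
// <lo(ae)|hi(ae)|lo(cg)|hi(cg)> and Odds is <lo(bf)|hi(bf)|lo(dh)|hi(dh)>, so
// ShufMask {0,4,2,6} selects <lo(ae)|lo(bf)|lo(cg)|lo(dh)>, the truncated
// 32-bit product of each lane.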
29812
29813 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29814 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29815 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29816
29817 // Ahi = psrlqi(a, 32);
29818 // Bhi = psrlqi(b, 32);
29819 //
29820 // AloBlo = pmuludq(a, b);
29821 // AloBhi = pmuludq(a, Bhi);
29822 // AhiBlo = pmuludq(Ahi, b);
29823 //
29824 // Hi = psllqi(AloBhi + AhiBlo, 32);
29825 // return AloBlo + Hi;
29826 KnownBits AKnown = DAG.computeKnownBits(A);
29827 KnownBits BKnown = DAG.computeKnownBits(B);
29828
29829 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29830 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29831 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29832
29833 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29834 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29835 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29836
29837 SDValue Zero = DAG.getConstant(0, dl, VT);
29838
29839 // Only multiply lo/hi halves that aren't known to be zero.
29840 SDValue AloBlo = Zero;
29841 if (!ALoIsZero && !BLoIsZero)
29842 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29843
29844 SDValue AloBhi = Zero;
29845 if (!ALoIsZero && !BHiIsZero) {
29846 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29847 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29848 }
29849
29850 SDValue AhiBlo = Zero;
29851 if (!AHiIsZero && !BLoIsZero) {
29852 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29853 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29854 }
29855
29856 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29857 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29858
29859 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29860}
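// Illustrative note (not part of the original LLVM source): writing
// A = Ahi*2^32 + Alo and B = Bhi*2^32 + Blo, the product modulo 2^64 is
// Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32), since the Ahi*Bhi*2^64 term wraps to
// zero; that is exactly the AloBlo + Hi combination computed above.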
29861
29862static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29863 MVT VT, bool IsSigned,
29864 const X86Subtarget &Subtarget,
29865 SelectionDAG &DAG,
29866 SDValue *Low = nullptr) {
29867 unsigned NumElts = VT.getVectorNumElements();
29868
29869 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29870 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29871 // lane results back together.
29872
29873 // We'll take different approaches for signed and unsigned.
29874 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29875 // and use pmullw to calculate the full 16-bit product.
29876 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29877 // shift them left into the upper byte of each word. This allows us to use
29878 // pmulhw to calculate the full 16-bit product. This trick means we don't
29879 // need to sign extend the bytes to use pmullw.
29880
29881 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29882 SDValue Zero = DAG.getConstant(0, dl, VT);
29883
29884 SDValue ALo, AHi;
29885 if (IsSigned) {
29886 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29887 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29888 } else {
29889 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29890 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29891 }
29892
29893 SDValue BLo, BHi;
29894 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29895 // If the RHS is a constant, manually unpackl/unpackh and extend.
29896 SmallVector<SDValue, 16> LoOps, HiOps;
29897 for (unsigned i = 0; i != NumElts; i += 16) {
29898 for (unsigned j = 0; j != 8; ++j) {
29899 SDValue LoOp = B.getOperand(i + j);
29900 SDValue HiOp = B.getOperand(i + j + 8);
29901
29902 if (IsSigned) {
29903 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29904 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29905 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29906 DAG.getConstant(8, dl, MVT::i16));
29907 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29908 DAG.getConstant(8, dl, MVT::i16));
29909 } else {
29910 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29911 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29912 }
29913
29914 LoOps.push_back(LoOp);
29915 HiOps.push_back(HiOp);
29916 }
29917 }
29918
29919 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29920 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29921 } else if (IsSigned) {
29922 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29923 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29924 } else {
29925 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29926 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29927 }
29928
29929 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
29930 // pack back to vXi8.
29931 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29932 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29933 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29934
29935 if (Low)
29936 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29937
29938 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29939}
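// Illustrative note (not part of the original LLVM source): the signed trick
// works because placing a byte in the high byte of a word multiplies it by
// 2^8, so (a*2^8)*(b*2^8) = a*b*2^16, and pmulhw, which returns bits 31:16 of
// the 32-bit signed product, therefore yields the full 16-bit signed product
// a*b directly.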
29940
29941static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29942 SelectionDAG &DAG) {
29943 SDLoc dl(Op);
29944 MVT VT = Op.getSimpleValueType();
29945 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29946 unsigned NumElts = VT.getVectorNumElements();
29947 SDValue A = Op.getOperand(0);
29948 SDValue B = Op.getOperand(1);
29949
29950 // Decompose 256-bit ops into 128-bit ops.
29951 if (VT.is256BitVector() && !Subtarget.hasInt256())
29952 return splitVectorIntBinary(Op, DAG);
29953
29954 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29955 return splitVectorIntBinary(Op, DAG);
29956
29957 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29958 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29959 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29960 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29961
29962 // PMULxD operations multiply each even value (starting at 0) of LHS with
29963 // the related value of RHS and produce a widened result.
29964 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29965 // => <2 x i64> <ae|cg>
29966 //
29967 // In other words, to have all the results, we need to perform two PMULxD:
29968 // 1. one with the even values.
29969 // 2. one with the odd values.
29970 // To achieve #2, we need to place the odd values at an even position.
29971 //
29972 // Place the odd value at an even position (basically, shift all values 1
29973 // step to the left):
29974 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29975 9, -1, 11, -1, 13, -1, 15, -1};
29976 // <a|b|c|d> => <b|undef|d|undef>
29977 SDValue Odd0 =
29978 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29979 // <e|f|g|h> => <f|undef|h|undef>
29980 SDValue Odd1 =
29981 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29982
29983 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29984 // ints.
29985 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29986 unsigned Opcode =
29987 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29988 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29989 // => <2 x i64> <ae|cg>
29990 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29991 DAG.getBitcast(MulVT, A),
29992 DAG.getBitcast(MulVT, B)));
29993 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29994 // => <2 x i64> <bf|dh>
29995 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29996 DAG.getBitcast(MulVT, Odd0),
29997 DAG.getBitcast(MulVT, Odd1)));
29998
29999 // Shuffle it back into the right order.
30000 SmallVector<int, 16> ShufMask(NumElts);
30001 for (int i = 0; i != (int)NumElts; ++i)
30002 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
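// Illustrative worked example (not part of the original LLVM source): for
// NumElts = 4 this mask is {1, 5, 3, 7}, i.e. the high 32-bit halves of the
// 64-bit products, taken alternately from Mul1 (even source lanes) and Mul2
// (odd source lanes).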
30003
30004 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30005
30006 // If we have a signed multiply but no PMULDQ fix up the result of an
30007 // unsigned multiply.
30008 if (IsSigned && !Subtarget.hasSSE41()) {
30009 SDValue Zero = DAG.getConstant(0, dl, VT);
30010 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30011 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30012 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30013 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30014
30015 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30016 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30017 }
30018
30019 return Res;
30020 }
30021
30022 // Only i8 vectors should need custom lowering after this.
30023 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30024 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30025 "Unsupported vector type");
30026
30027 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30028 // logical shift down the upper half and pack back to i8.
30029
30030 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30031 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30032
30033 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30034 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30035 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30036 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30037 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30038 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30039 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30040 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30041 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30042 }
30043
30044 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30045}
30046
30047// Custom lowering for SMULO/UMULO.
30048static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30049 SelectionDAG &DAG) {
30050 MVT VT = Op.getSimpleValueType();
30051
30052 // Scalars defer to LowerXALUO.
30053 if (!VT.isVector())
30054 return LowerXALUO(Op, DAG);
30055
30056 SDLoc dl(Op);
30057 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30058 SDValue A = Op.getOperand(0);
30059 SDValue B = Op.getOperand(1);
30060 EVT OvfVT = Op->getValueType(1);
30061
30062 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30063 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30064 // Extract the LHS Lo/Hi vectors
30065 SDValue LHSLo, LHSHi;
30066 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30067
30068 // Extract the RHS Lo/Hi vectors
30069 SDValue RHSLo, RHSHi;
30070 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30071
30072 EVT LoOvfVT, HiOvfVT;
30073 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30074 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30075 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30076
30077 // Issue the split operations.
30078 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30079 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30080
30081 // Join the separate data results and the overflow results.
30082 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30083 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30084 Hi.getValue(1));
30085
30086 return DAG.getMergeValues({Res, Ovf}, dl);
30087 }
30088
30089 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30090 EVT SetccVT =
30091 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30092
30093 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30094 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30095 unsigned NumElts = VT.getVectorNumElements();
30096 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30097 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30098 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30099 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30100 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30101
30102 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30103
30104 SDValue Ovf;
30105 if (IsSigned) {
30106 SDValue High, LowSign;
30107 if (OvfVT.getVectorElementType() == MVT::i1 &&
30108 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30109 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30110 // Shift the high down filling with sign bits.
30111 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30112 // Fill all 16 bits with the sign bit from the low.
30113 LowSign =
30114 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30115 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30116 15, DAG);
30117 SetccVT = OvfVT;
30118 if (!Subtarget.hasBWI()) {
30119 // We can't do a vXi16 compare so sign extend to v16i32.
30120 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30121 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30122 }
30123 } else {
30124 // Otherwise do the compare at vXi8.
30125 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30126 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30127 LowSign =
30128 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30129 }
30130
30131 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30132 } else {
30133 SDValue High =
30134 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30135 if (OvfVT.getVectorElementType() == MVT::i1 &&
30136 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30137 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30138 SetccVT = OvfVT;
30139 if (!Subtarget.hasBWI()) {
30140 // We can't do a vXi16 compare so sign extend to v16i32.
30141 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30142 }
30143 } else {
30144 // Otherwise do the compare at vXi8.
30145 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30146 }
30147
30148 Ovf =
30149 DAG.getSetCC(dl, SetccVT, High,
30150 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30151 }
30152
30153 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30154
30155 return DAG.getMergeValues({Low, Ovf}, dl);
30156 }
30157
30158 SDValue Low;
30159 SDValue High =
30160 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30161
30162 SDValue Ovf;
30163 if (IsSigned) {
30164 // SMULO overflows if the high bits don't match the sign of the low.
30165 SDValue LowSign =
30166 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30167 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30168 } else {
30169 // UMULO overflows if the high bits are non-zero.
30170 Ovf =
30171 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30172 }
30173
30174 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30175
30176 return DAG.getMergeValues({Low, Ovf}, dl);
30177}
30178
30179SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30180 assert(Subtarget.isTargetWin64() && "Unexpected target");
30181 EVT VT = Op.getValueType();
30182 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30183 "Unexpected return type for lowering");
30184
30185 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30186 SmallVector<SDValue> Result;
30187 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30188 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30189 }
30190
30191 RTLIB::Libcall LC;
30192 bool isSigned;
30193 switch (Op->getOpcode()) {
30194 default: llvm_unreachable("Unexpected request for libcall!");
30195 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30196 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30197 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30198 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30199 }
30200
30201 SDLoc dl(Op);
30202 SDValue InChain = DAG.getEntryNode();
30203
30204 TargetLowering::ArgListTy Args;
30205 TargetLowering::ArgListEntry Entry;
30206 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30207 EVT ArgVT = Op->getOperand(i).getValueType();
30208 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30209 "Unexpected argument type for lowering");
30210 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30211 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30212 MachinePointerInfo MPI =
30213 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30214 Entry.Node = StackPtr;
30215 InChain =
30216 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30217 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30218 Entry.Ty = PointerType::get(ArgTy,0);
30219 Entry.IsSExt = false;
30220 Entry.IsZExt = false;
30221 Args.push_back(Entry);
30222 }
30223
30224 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30225 getPointerTy(DAG.getDataLayout()));
30226
30227 TargetLowering::CallLoweringInfo CLI(DAG);
30228 CLI.setDebugLoc(dl)
30229 .setChain(InChain)
30230 .setLibCallee(
30231 getLibcallCallingConv(LC),
30232 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30233 std::move(Args))
30234 .setInRegister()
30235 .setSExtResult(isSigned)
30236 .setZExtResult(!isSigned);
30237
30238 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30239 return DAG.getBitcast(VT, CallInfo.first);
30240}
30241
30242SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30243 SelectionDAG &DAG,
30244 SDValue &Chain) const {
30245 assert(Subtarget.isTargetWin64() && "Unexpected target");
30246 EVT VT = Op.getValueType();
30247 bool IsStrict = Op->isStrictFPOpcode();
30248
30249 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30250 EVT ArgVT = Arg.getValueType();
30251
30252 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30253 "Unexpected return type for lowering");
30254
30255 RTLIB::Libcall LC;
30256 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30257 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30258 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30259 else
30260 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30261 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30262
30263 SDLoc dl(Op);
30264 MakeLibCallOptions CallOptions;
30265 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30266
30267 SDValue Result;
30268 // Expect the i128 result to be returned as a v2i64 in xmm0, then cast it
30269 // back to the expected VT (i128).
30270 std::tie(Result, Chain) =
30271 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30272 Result = DAG.getBitcast(VT, Result);
30273 return Result;
30274}
30275
30276SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30277 SelectionDAG &DAG) const {
30278 assert(Subtarget.isTargetWin64() && "Unexpected target");
30279 EVT VT = Op.getValueType();
30280 bool IsStrict = Op->isStrictFPOpcode();
30281
30282 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30283 EVT ArgVT = Arg.getValueType();
30284
30285 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30286 "Unexpected argument type for lowering");
30287
30288 RTLIB::Libcall LC;
30289 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30290 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30291 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30292 else
30293 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30294 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30295
30296 SDLoc dl(Op);
30297 MakeLibCallOptions CallOptions;
30298 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30299
30300 // Pass the i128 argument as an indirect argument on the stack.
30301 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30302 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30303 MachinePointerInfo MPI =
30304 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30305 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30306
30307 SDValue Result;
30308 std::tie(Result, Chain) =
30309 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30310 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30311}
30312
30313// Return true if the required (according to Opcode) shift-imm form is natively
30314// supported by the Subtarget
30315static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
30316 unsigned Opcode) {
30317 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30318 return false;
30319
30320 if (VT.getScalarSizeInBits() < 16)
30321 return false;
30322
30323 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30324 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30325 return true;
30326
30327 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30328 (VT.is256BitVector() && Subtarget.hasInt256());
30329
30330 bool AShift = LShift && (Subtarget.hasAVX512() ||
30331 (VT != MVT::v2i64 && VT != MVT::v4i64));
30332 return (Opcode == ISD::SRA) ? AShift : LShift;
30333}
30334
30335// The shift amount is a variable, but it is the same for all vector lanes.
30336// These instructions are defined together with shift-immediate.
30337static
30338bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
30339 unsigned Opcode) {
30340 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30341}
30342
30343// Return true if the required (according to Opcode) variable-shift form is
30344// natively supported by the Subtarget
30345static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
30346 unsigned Opcode) {
30347 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30348 return false;
30349
30350 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30351 return false;
30352
30353 // vXi16 supported only on AVX-512, BWI
30354 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30355 return false;
30356
30357 if (Subtarget.hasAVX512() &&
30358 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30359 return true;
30360
30361 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30362 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30363 return (Opcode == ISD::SRA) ? AShift : LShift;
30364}
30365
30366static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30367 const X86Subtarget &Subtarget) {
30368 MVT VT = Op.getSimpleValueType();
30369 SDLoc dl(Op);
30370 SDValue R = Op.getOperand(0);
30371 SDValue Amt = Op.getOperand(1);
30372 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30373
30374 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30375 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30376 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30377 SDValue Ex = DAG.getBitcast(ExVT, R);
30378
30379 // ashr(R, 63) === cmp_slt(R, 0)
30380 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30381 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30382 "Unsupported PCMPGT op");
30383 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30384 }
30385
30386 if (ShiftAmt >= 32) {
30387 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30388 SDValue Upper =
30389 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30390 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30391 ShiftAmt - 32, DAG);
30392 if (VT == MVT::v2i64)
30393 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30394 if (VT == MVT::v4i64)
30395 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30396 {9, 1, 11, 3, 13, 5, 15, 7});
30397 } else {
30398 // SRA upper i32, SRL whole i64 and select lower i32.
30399 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30400 ShiftAmt, DAG);
30401 SDValue Lower =
30402 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30403 Lower = DAG.getBitcast(ExVT, Lower);
30404 if (VT == MVT::v2i64)
30405 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30406 if (VT == MVT::v4i64)
30407 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30408 {8, 1, 10, 3, 12, 5, 14, 7});
30409 }
30410 return DAG.getBitcast(VT, Ex);
30411 };
30412
30413 // Optimize shl/srl/sra with constant shift amount.
30414 APInt APIntShiftAmt;
30415 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30416 return SDValue();
30417
30418 // If the shift amount is out of range, return undef.
30419 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
30420 return DAG.getUNDEF(VT);
30421
30422 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30423
30424 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30425 // Hardware support for vector shifts is sparse which makes us scalarize the
30426 // vector operations in many cases. Also, on sandybridge ADD is faster than
30427 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30428 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30429 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30430 // must be 0). (add undef, undef) however can be any value. To make this
30431 // safe, we must freeze R to ensure that register allocation uses the same
30432 // register for an undefined value. This ensures that the result will
30433 // still be even and preserves the original semantics.
30434 R = DAG.getFreeze(R);
30435 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30436 }
30437
30438 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30439 }
30440
30441 // i64 SRA needs to be performed as partial shifts.
30442 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30443 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30444 Op.getOpcode() == ISD::SRA)
30445 return ArithmeticShiftRight64(ShiftAmt);
30446
30447 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30448 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30449 unsigned NumElts = VT.getVectorNumElements();
30450 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30451
30452 // Simple i8 add case
30453 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30454 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30455 // must be 0). (add undef, undef) however can be any value. To make this
30456 // safe, we must freeze R to ensure that register allocation uses the same
30457 // register for an undefined value. This ensures that the result will
30458 // still be even and preserves the original semantics.
30459 R = DAG.getFreeze(R);
30460 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30461 }
30462
30463 // ashr(R, 7) === cmp_slt(R, 0)
30464 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30465 SDValue Zeros = DAG.getConstant(0, dl, VT);
30466 if (VT.is512BitVector()) {
30467 assert(VT == MVT::v64i8 && "Unexpected element type!");
30468 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30469 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30470 }
30471 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30472 }
30473
30474 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30475 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30476 return SDValue();
30477
30478 if (Op.getOpcode() == ISD::SHL) {
30479 // Make a large shift.
30480 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30481 ShiftAmt, DAG);
30482 SHL = DAG.getBitcast(VT, SHL);
30483 // Zero out the rightmost bits.
30484 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30485 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30486 }
30487 if (Op.getOpcode() == ISD::SRL) {
30488 // Make a large shift.
30489 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30490 ShiftAmt, DAG);
30491 SRL = DAG.getBitcast(VT, SRL);
30492 // Zero out the leftmost bits.
30493 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30494 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30495 }
30496 if (Op.getOpcode() == ISD::SRA) {
30497 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30498 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30499
30500 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30501 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30502 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30503 return Res;
30504 }
30505 llvm_unreachable("Unknown shift opcode.");
30506 }
30507
30508 return SDValue();
30509}
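
The vXi8 SRA branch above relies on the sign-extension identity ashr(R, Amt) == sub(xor(lshr(R, Amt), Mask), Mask) with Mask = 128 >> Amt. A minimal scalar check of that identity (standalone sketch, not part of this file):

#include <cassert>
#include <cstdint>

// ashr i8 rebuilt from a logical shift: e.g. x = 0xF0 (-16), s = 4 gives
// lshr = 0x0F, Mask = 0x08, 0x0F ^ 0x08 = 0x07, 0x07 - 0x08 = 0xFF (-1).
static uint8_t AshrI8ViaLshr(uint8_t X, unsigned S) {
  uint8_t Mask = (uint8_t)(0x80u >> S);
  return (uint8_t)(((uint8_t)(X >> S) ^ Mask) - Mask);
}

int main() {
  for (int X = 0; X < 256; ++X)
    for (unsigned S = 0; S < 8; ++S)
      assert(AshrI8ViaLshr((uint8_t)X, S) == (uint8_t)((int8_t)X >> S));
}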
30510
30511static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30512 const X86Subtarget &Subtarget) {
30513 MVT VT = Op.getSimpleValueType();
30514 SDLoc dl(Op);
30515 SDValue R = Op.getOperand(0);
30516 SDValue Amt = Op.getOperand(1);
30517 unsigned Opcode = Op.getOpcode();
30518 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30519
30520 int BaseShAmtIdx = -1;
30521 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30522 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30523 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30524 Subtarget, DAG);
30525
30526 // vXi8 shifts - shift as v8i16 + mask result.
30527 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30528 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30529 VT == MVT::v64i8) &&
30530 !Subtarget.hasXOP()) {
30531 unsigned NumElts = VT.getVectorNumElements();
30532 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30533 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30534 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30535 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30536
30537 // Create the mask using vXi16 shifts. For shift-rights we need to move
30538 // the upper byte down before splatting the vXi8 mask.
30539 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
30540 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30541 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30542 if (Opcode != ISD::SHL)
30543 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30544 8, DAG);
30545 BitMask = DAG.getBitcast(VT, BitMask);
30546 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30547 SmallVector<int, 64>(NumElts, 0));
30548
30549 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30550 DAG.getBitcast(ExtVT, R), BaseShAmt,
30551 BaseShAmtIdx, Subtarget, DAG);
30552 Res = DAG.getBitcast(VT, Res);
30553 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30554
30555 if (Opcode == ISD::SRA) {
30556 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30557 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30558 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30559 SignMask =
30560 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30561 BaseShAmtIdx, Subtarget, DAG);
30562 SignMask = DAG.getBitcast(VT, SignMask);
30563 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30564 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30565 }
30566 return Res;
30567 }
30568 }
30569 }
30570
30571 return SDValue();
30572}
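
LowerShiftByScalarVariable performs vXi8 shifts as vXi16 shifts and then masks away the bits that leak across the byte boundary (plus the sign fixup for SRA). A simplified scalar model of the logical-shift case, treating one i16 lane as two packed bytes; the names are illustrative and the mask is a direct stand-in for the splatted BitMask built above:

#include <cassert>
#include <cstdint>

// Shift two packed bytes with one 16-bit shift, then AND with a per-byte mask.
static uint16_t LshrBytesViaI16(uint16_t Lane, unsigned S) {
  uint16_t Shifted = (uint16_t)(Lane >> S);
  uint8_t ByteMask = (uint8_t)(0xFFu >> S);
  uint16_t BitMask = (uint16_t)(ByteMask | (ByteMask << 8));
  return (uint16_t)(Shifted & BitMask);
}

int main() {
  uint16_t Lane = 0xA5C3; // bytes 0xA5 (upper) and 0xC3 (lower)
  unsigned S = 3;
  uint16_t Expected = (uint16_t)(((0xA5u >> S) << 8) | (0xC3u >> S));
  assert(LshrBytesViaI16(Lane, S) == Expected);
}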
30573
30574// Convert a shift/rotate left amount to a multiplication scale factor.
30575static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30576 const X86Subtarget &Subtarget,
30577 SelectionDAG &DAG) {
30578 MVT VT = Amt.getSimpleValueType();
30579 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30580 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30581 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30582 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30583 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30584 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30585 return SDValue();
30586
30587 MVT SVT = VT.getVectorElementType();
30588 unsigned SVTBits = SVT.getSizeInBits();
30589 unsigned NumElems = VT.getVectorNumElements();
30590
30591 APInt UndefElts;
30592 SmallVector<APInt> EltBits;
30593 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30594 APInt One(SVTBits, 1);
30595 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30596 for (unsigned I = 0; I != NumElems; ++I) {
30597 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30598 continue;
30599 uint64_t ShAmt = EltBits[I].getZExtValue();
30600 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30601 }
30602 return DAG.getBuildVector(VT, dl, Elts);
30603 }
30604
30605 // If the target doesn't support variable shifts, use either FP conversion
30606 // or integer multiplication to avoid shifting each element individually.
30607 if (VT == MVT::v4i32) {
30608 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30609 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30610 DAG.getConstant(0x3f800000U, dl, VT));
30611 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30612 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30613 }
30614
30615 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30616 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30617 SDValue Z = DAG.getConstant(0, dl, VT);
30618 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30619 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30620 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30621 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30622 if (Subtarget.hasSSE41())
30623 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30624 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30625 }
30626
30627 return SDValue();
30628}
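
For v4i32, convertShiftLeftToScale writes the shift amount into the exponent field of 1.0f so the bit pattern becomes the float 2^Amt, then converts back to an integer multiplier. A scalar sketch of that trick, kept to amounts up to 30 so the scalar conversion stays in int32 range (names are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t PowerOfTwoViaFloat(uint32_t Amt) { // Amt in [0, 30]
  uint32_t Bits = (Amt << 23) + 0x3f800000u; // 0x3f800000 is the bit pattern of 1.0f
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return (uint32_t)(int32_t)F; // the FP_TO_SINT step
}

int main() {
  for (uint32_t Amt = 0; Amt <= 30; ++Amt)
    assert(PowerOfTwoViaFloat(Amt) == (1u << Amt));
}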
30629
30630static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30631 SelectionDAG &DAG) {
30632 MVT VT = Op.getSimpleValueType();
30633 SDLoc dl(Op);
30634 SDValue R = Op.getOperand(0);
30635 SDValue Amt = Op.getOperand(1);
30636 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30637 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30638
30639 unsigned Opc = Op.getOpcode();
30640 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30641 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30642
30643 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30644 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30645
30646 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30647 return V;
30648
30649 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30650 return V;
30651
30652 if (supportedVectorVarShift(VT, Subtarget, Opc))
30653 return Op;
30654
30655 // i64 vector arithmetic shift can be emulated with the transform:
30656 // M = lshr(SIGN_MASK, Amt)
30657 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30658 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30659 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30660 Opc == ISD::SRA) {
30661 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30662 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30663 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30664 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30665 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30666 return R;
30667 }
30668
30669 // XOP has 128-bit variable logical/arithmetic shifts.
30670 // +ve/-ve Amt = shift left/right.
30671 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30672 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30673 if (Opc == ISD::SRL || Opc == ISD::SRA) {
30674 SDValue Zero = DAG.getConstant(0, dl, VT);
30675 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
30676 }
30677 if (Opc == ISD::SHL || Opc == ISD::SRL)
30678 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30679 if (Opc == ISD::SRA)
30680 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30681 }
30682
30683 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30684 // shifts per-lane and then shuffle the partial results back together.
30685 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30686 // Splat the shift amounts so the scalar shifts above will catch it.
30687 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30688 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30689 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30690 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30691 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30692 }
30693
30694 // If possible, lower this shift as a sequence of two shifts by
30695 // constant plus a BLENDing shuffle instead of scalarizing it.
30696 // Example:
30697 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30698 //
30699 // Could be rewritten as:
30700 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30701 //
30702 // The advantage is that the two shifts from the example would be
30703 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30704 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30705 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30706 SDValue Amt1, Amt2;
30707 unsigned NumElts = VT.getVectorNumElements();
30708 SmallVector<int, 8> ShuffleMask;
30709 for (unsigned i = 0; i != NumElts; ++i) {
30710 SDValue A = Amt->getOperand(i);
30711 if (A.isUndef()) {
30712 ShuffleMask.push_back(SM_SentinelUndef);
30713 continue;
30714 }
30715 if (!Amt1 || Amt1 == A) {
30716 ShuffleMask.push_back(i);
30717 Amt1 = A;
30718 continue;
30719 }
30720 if (!Amt2 || Amt2 == A) {
30721 ShuffleMask.push_back(i + NumElts);
30722 Amt2 = A;
30723 continue;
30724 }
30725 break;
30726 }
30727
30728 // Only perform this blend if we can perform it without loading a mask.
30729 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
30730 (VT != MVT::v16i16 ||
30731 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30732 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30733 canWidenShuffleElements(ShuffleMask))) {
30734 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
30735 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
30736 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
30737 Cst2->getAPIntValue().ult(EltSizeInBits)) {
30738 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
30739 Cst1->getZExtValue(), DAG);
30740 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
30741 Cst2->getZExtValue(), DAG);
30742 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30743 }
30744 }
30745 }
30746
30747 // If possible, lower this packed shift into a vector multiply instead of
30748 // expanding it into a sequence of scalar shifts.
30749 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30750 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30751 Subtarget.canExtendTo512BW())))
30752 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30753 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30754
30755 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30756 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30757 if (Opc == ISD::SRL && ConstantAmt &&
30758 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30759 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30760 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30761 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30762 SDValue Zero = DAG.getConstant(0, dl, VT);
30763 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30764 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30765 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30766 }
30767 }
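
The block above replaces a constant vXi16 SRL with MULHU using the scale 2^(16 - Amt); zero-amount lanes are handled by the preceding select. A scalar check of the identity for 0 < Amt < 16 (standalone sketch, not part of this file):

#include <cassert>
#include <cstdint>

// lshr16(x, s) == high half of x * 2^(16 - s), i.e. MULHU(x, scale).
static uint16_t LshrViaMulhu(uint16_t X, unsigned S) { // 0 < S < 16
  uint16_t Scale = (uint16_t)(1u << (16 - S));
  return (uint16_t)(((uint32_t)X * Scale) >> 16);
}

int main() {
  for (unsigned S = 1; S < 16; ++S)
    assert(LshrViaMulhu(0xBEEF, S) == (uint16_t)(0xBEEFu >> S));
}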
30768
30769 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30770 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30771 // TODO: Special case handling for shift by 0/1, really we can afford either
30772 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30773 if (Opc == ISD::SRA && ConstantAmt &&
30774 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30775 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30776 !Subtarget.hasAVX512()) ||
30777 DAG.isKnownNeverZero(Amt))) {
30778 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30779 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30780 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30781 SDValue Amt0 =
30782 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30783 SDValue Amt1 =
30784 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30785 SDValue Sra1 =
30786 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30787 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30788 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30789 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30790 }
30791 }
30792
30793 // v4i32 Non Uniform Shifts.
30794 // If the shift amount is constant we can shift each lane using the SSE2
30795 // immediate shifts, else we need to zero-extend each lane to the lower i64
30796 // and shift using the SSE2 variable shifts.
30797 // The separate results can then be blended together.
30798 if (VT == MVT::v4i32) {
30799 SDValue Amt0, Amt1, Amt2, Amt3;
30800 if (ConstantAmt) {
30801 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30802 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30803 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30804 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30805 } else {
30806 // The SSE2 shifts use the lower i64 as the same shift amount for
30807 // all lanes and the upper i64 is ignored. On AVX we're better off
30808 // just zero-extending, but for SSE just duplicating the top 16-bits is
30809 // cheaper and has the same effect for out of range values.
30810 if (Subtarget.hasAVX()) {
30811 SDValue Z = DAG.getConstant(0, dl, VT);
30812 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30813 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30814 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30815 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30816 } else {
30817 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30818 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30819 {4, 5, 6, 7, -1, -1, -1, -1});
30820 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30821 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30822 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30823 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30824 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30825 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30826 }
30827 }
30828
30829 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30830 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30831 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30832 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30833 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30834
30835 // Merge the shifted lane results optimally with/without PBLENDW.
30836 // TODO - ideally shuffle combining would handle this.
30837 if (Subtarget.hasSSE41()) {
30838 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30839 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30840 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30841 }
30842 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30843 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30844 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30845 }
30846
30847 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30848 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30849 // make the existing SSE solution better.
30850 // NOTE: We honor preferred vector width before promoting to 512-bits.
30851 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30852 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30853 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30854 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30855 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30856 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30857 "Unexpected vector type");
30858 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30859 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
30860 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30861 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30862 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30863 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30864 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30865 }
30866
30867 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30868 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30869 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30870 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30871 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30872 !Subtarget.hasXOP()) {
30873 int NumElts = VT.getVectorNumElements();
30874 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30875
30876 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30877 // isn't legal).
30878 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30879 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30880 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30881 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30882 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30883 "Constant build vector expected");
30884
30885 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30886 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
30887 : DAG.getZExtOrTrunc(R, dl, ExVT);
30888 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30889 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30890 return DAG.getZExtOrTrunc(R, dl, VT);
30891 }
30892
30893 SmallVector<SDValue, 16> LoAmt, HiAmt;
30894 for (int i = 0; i != NumElts; i += 16) {
30895 for (int j = 0; j != 8; ++j) {
30896 LoAmt.push_back(Amt.getOperand(i + j));
30897 HiAmt.push_back(Amt.getOperand(i + j + 8));
30898 }
30899 }
30900
30901 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30902 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30903 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30904
30905 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30906 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30907 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30908 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30909 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30910 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30911 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30912 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30913 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30914 }
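
The constant vXi8 path above widens each byte to i16 (sign- or zero-extended), multiplies by 2^(8 - Amt), and keeps the high byte, which is exactly a per-byte shift right. A scalar sketch of both variants (illustrative helper, not part of this file):

#include <cassert>
#include <cstdint>

static uint8_t ShrViaMul(uint8_t X, unsigned S, bool Arithmetic) {
  uint16_t Wide = Arithmetic ? (uint16_t)(int16_t)(int8_t)X : (uint16_t)X;
  uint16_t Scale = (uint16_t)(1u << (8 - S));
  return (uint8_t)((uint16_t)(Wide * Scale) >> 8); // keep the high byte
}

int main() {
  for (int X = 0; X < 256; ++X)
    for (unsigned S = 0; S < 8; ++S) {
      assert(ShrViaMul((uint8_t)X, S, false) == (uint8_t)((uint8_t)X >> S));
      assert(ShrViaMul((uint8_t)X, S, true) == (uint8_t)((int8_t)X >> S));
    }
}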
30915
30916 if (VT == MVT::v16i8 ||
30917 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30918 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30919 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
30920
30921 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30922 if (VT.is512BitVector()) {
30923 // On AVX512BW targets we make use of the fact that VSELECT lowers
30924 // to a masked blend which selects bytes based just on the sign bit
30925 // extracted to a mask.
30926 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
30927 V0 = DAG.getBitcast(VT, V0);
30928 V1 = DAG.getBitcast(VT, V1);
30929 Sel = DAG.getBitcast(VT, Sel);
30930 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30931 ISD::SETGT);
30932 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30933 } else if (Subtarget.hasSSE41()) {
30934 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30935 // on the sign bit.
30936 V0 = DAG.getBitcast(VT, V0);
30937 V1 = DAG.getBitcast(VT, V1);
30938 Sel = DAG.getBitcast(VT, Sel);
30939 return DAG.getBitcast(SelVT,
30940 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
30941 }
30942 // On pre-SSE41 targets we test for the sign bit by comparing to
30943 // zero - a negative value will set all bits of the lanes to true
30944 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30945 SDValue Z = DAG.getConstant(0, dl, SelVT);
30946 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
30947 return DAG.getSelect(dl, SelVT, C, V0, V1);
30948 };
30949
30950 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30951 // We can safely do this using i16 shifts as we're only interested in
30952 // the 3 lower bits of each byte.
30953 Amt = DAG.getBitcast(ExtVT, Amt);
30954 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
30955 Amt = DAG.getBitcast(VT, Amt);
30956
30957 if (Opc == ISD::SHL || Opc == ISD::SRL) {
30958 // r = VSELECT(r, shift(r, 4), a);
30959 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
30960 R = SignBitSelect(VT, Amt, M, R);
30961
30962 // a += a
30963 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30964
30965 // r = VSELECT(r, shift(r, 2), a);
30966 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
30967 R = SignBitSelect(VT, Amt, M, R);
30968
30969 // a += a
30970 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30971
30972 // return VSELECT(r, shift(r, 1), a);
30973 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
30974 R = SignBitSelect(VT, Amt, M, R);
30975 return R;
30976 }
30977
30978 if (Opc == ISD::SRA) {
30979 // For SRA we need to unpack each byte to the higher byte of a i16 vector
30980 // so we can correctly sign extend. We don't care what happens to the
30981 // lower byte.
30982 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30983 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30984 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
30985 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
30986 ALo = DAG.getBitcast(ExtVT, ALo);
30987 AHi = DAG.getBitcast(ExtVT, AHi);
30988 RLo = DAG.getBitcast(ExtVT, RLo);
30989 RHi = DAG.getBitcast(ExtVT, RHi);
30990
30991 // r = VSELECT(r, shift(r, 4), a);
30992 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
30993 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
30994 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30995 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30996
30997 // a += a
30998 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30999 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31000
31001 // r = VSELECT(r, shift(r, 2), a);
31002 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31003 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31004 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31005 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31006
31007 // a += a
31008 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31009 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31010
31011 // r = VSELECT(r, shift(r, 1), a);
31012 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31013 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31014 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31015 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31016
31017 // Logical shift the result back to the lower byte, leaving a zero upper
31018 // byte meaning that we can safely pack with PACKUSWB.
31019 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31020 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31021 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31022 }
31023 }
31024
31025 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31026 MVT ExtVT = MVT::v8i32;
31027 SDValue Z = DAG.getConstant(0, dl, VT);
31028 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31029 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31030 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31031 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31032 ALo = DAG.getBitcast(ExtVT, ALo);
31033 AHi = DAG.getBitcast(ExtVT, AHi);
31034 RLo = DAG.getBitcast(ExtVT, RLo);
31035 RHi = DAG.getBitcast(ExtVT, RHi);
31036 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31037 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31038 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31039 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31040 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31041 }
31042
31043 if (VT == MVT::v8i16) {
31044 // If we have a constant shift amount, the non-SSE41 path is best as
31045 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31046 bool UseSSE41 = Subtarget.hasSSE41() &&
31047 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31048
31049 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31050 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31051 // the sign bit.
31052 if (UseSSE41) {
31053 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31054 V0 = DAG.getBitcast(ExtVT, V0);
31055 V1 = DAG.getBitcast(ExtVT, V1);
31056 Sel = DAG.getBitcast(ExtVT, Sel);
31057 return DAG.getBitcast(
31058 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31059 }
31060 // On pre-SSE41 targets we splat the sign bit - a negative value will
31061 // set all bits of the lanes to true and VSELECT uses that in
31062 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31063 SDValue C =
31064 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31065 return DAG.getSelect(dl, VT, C, V0, V1);
31066 };
31067
31068 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31069 if (UseSSE41) {
31070 // On SSE41 targets we need to replicate the shift mask in both
31071 // bytes for PBLENDVB.
31072 Amt = DAG.getNode(
31073 ISD::OR, dl, VT,
31074 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31075 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31076 } else {
31077 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31078 }
31079
31080 // r = VSELECT(r, shift(r, 8), a);
31081 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31082 R = SignBitSelect(Amt, M, R);
31083
31084 // a += a
31085 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31086
31087 // r = VSELECT(r, shift(r, 4), a);
31088 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31089 R = SignBitSelect(Amt, M, R);
31090
31091 // a += a
31092 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31093
31094 // r = VSELECT(r, shift(r, 2), a);
31095 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31096 R = SignBitSelect(Amt, M, R);
31097
31098 // a += a
31099 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31100
31101 // return VSELECT(r, shift(r, 1), a);
31102 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31103 R = SignBitSelect(Amt, M, R);
31104 return R;
31105 }
31106
31107 // Decompose 256-bit shifts into 128-bit shifts.
31108 if (VT.is256BitVector())
31109 return splitVectorIntBinary(Op, DAG);
31110
31111 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31112 return splitVectorIntBinary(Op, DAG);
31113
31114 return SDValue();
31115}
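
The vXi8 and v8i16 fallbacks in LowerShift apply the shift bit-serially: the amount is pre-shifted so that one amount bit at a time sits in the sign-bit position, and a blend selects between the shifted and unshifted value. A scalar model of the vXi8 SHL ladder (shift-by-4, then 2, then 1); the helper and test are illustrative only:

#include <cassert>
#include <cstdint>

static uint8_t ShlViaSignBitSelect(uint8_t R, uint8_t Amt) { // Amt in [0, 7]
  uint8_t A = (uint8_t)(Amt << 5); // bit 2 of Amt now sits in the sign bit
  const unsigned Steps[] = {4, 2, 1};
  for (unsigned Step : Steps) {
    if (A & 0x80)                  // the PBLENDVB/VSELECT "sign bit" test
      R = (uint8_t)(R << Step);
    A = (uint8_t)(A + A);          // expose the next amount bit
  }
  return R;
}

int main() {
  for (int R = 0; R < 256; ++R)
    for (unsigned Amt = 0; Amt < 8; ++Amt)
      assert(ShlViaSignBitSelect((uint8_t)R, (uint8_t)Amt) ==
             (uint8_t)((unsigned)R << Amt));
}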
31116
31117static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31118 SelectionDAG &DAG) {
31119 MVT VT = Op.getSimpleValueType();
31120 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31121 "Unexpected funnel shift opcode!");
31122
31123 SDLoc DL(Op);
31124 SDValue Op0 = Op.getOperand(0);
31125 SDValue Op1 = Op.getOperand(1);
31126 SDValue Amt = Op.getOperand(2);
31127 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31128 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31129
31130 if (VT.isVector()) {
31131 APInt APIntShiftAmt;
31132 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31133
31134 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31135 if (IsFSHR)
31136 std::swap(Op0, Op1);
31137
31138 if (IsCstSplat) {
31139 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31140 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31141 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31142 {Op0, Op1, Imm}, DAG, Subtarget);
31143 }
31144 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31145 {Op0, Op1, Amt}, DAG, Subtarget);
31146 }
31147 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31148 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31149 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31150 "Unexpected funnel shift type!");
31151
31152 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
31153 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
31154 if (IsCstSplat)
31155 return SDValue();
31156
31157 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31158 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31159 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31160
31161 // Constant vXi16 funnel shifts can be efficiently handled by default.
31162 if (IsCst && EltSizeInBits == 16)
31163 return SDValue();
31164
31165 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31166 unsigned NumElts = VT.getVectorNumElements();
31167 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31168 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31169
31170 // Split 256-bit integers on XOP/pre-AVX2 targets.
31171 // Split 512-bit integers on non 512-bit BWI targets.
31172 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31173 !Subtarget.hasAVX2())) ||
31174 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31175 EltSizeInBits < 32)) {
31176 // Pre-mask the amount modulo using the wider vector.
31177 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31178 return splitVectorOp(Op, DAG);
31179 }
31180
31181 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31182 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31183 int ScalarAmtIdx = -1;
31184 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31185 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31186 if (EltSizeInBits == 16)
31187 return SDValue();
31188
31189 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31190 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31191 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31192 ScalarAmtIdx, Subtarget, DAG);
31193 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31194 ScalarAmtIdx, Subtarget, DAG);
31195 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31196 }
31197 }
31198
31199 MVT WideSVT = MVT::getIntegerVT(
31200 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31201 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31202
31203 // If per-element shifts are legal, fallback to generic expansion.
31204 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31205 return SDValue();
31206
31207 // Attempt to fold as:
31208 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31209 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31210 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31211 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31212 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31213 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31214 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31215 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31216 EltSizeInBits, DAG);
31217 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31218 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31219 if (!IsFSHR)
31220 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31221 EltSizeInBits, DAG);
31222 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31223 }
31224
31225 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31226 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31227 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31228 SDValue Z = DAG.getConstant(0, DL, VT);
31229 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31230 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31231 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31232 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31233 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31234 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31235 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31236 }
31237
31238 // Fallback to generic expansion.
31239 return SDValue();
31240 }
31241 assert(
31242 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31243 "Unexpected funnel shift type!");
31244
31245 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31246 bool OptForSize = DAG.shouldOptForSize();
31247 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31248
31249 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31250 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31251 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31252 !isa<ConstantSDNode>(Amt)) {
31253 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31254 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31255 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31256 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31257 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31258 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31259 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31260 if (IsFSHR) {
31261 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31262 } else {
31263 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31264 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31265 }
31266 return DAG.getZExtOrTrunc(Res, DL, VT);
31267 }
31268
31269 if (VT == MVT::i8 || ExpandFunnel)
31270 return SDValue();
31271
31272 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31273 if (VT == MVT::i16) {
31274 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31275 DAG.getConstant(15, DL, Amt.getValueType()));
31276 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31277 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31278 }
31279
31280 return Op;
31281}
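
Several of the folds in LowerFunnelShift use the concatenation form of a funnel shift: with bw-bit elements, fshl(x,y,z) is the high half of (x:y) << (z mod bw) and fshr(x,y,z) is the low half of (x:y) >> (z mod bw). A scalar check for bw = 8 (standalone sketch, not part of this file):

#include <cassert>
#include <cstdint>

static uint8_t Fshl8(uint8_t X, uint8_t Y, unsigned Z) {
  uint16_t Concat = (uint16_t)((X << 8) | Y); // x:y
  return (uint8_t)((uint16_t)(Concat << (Z & 7)) >> 8);
}

static uint8_t Fshr8(uint8_t X, uint8_t Y, unsigned Z) {
  uint16_t Concat = (uint16_t)((X << 8) | Y); // x:y
  return (uint8_t)(Concat >> (Z & 7));
}

int main() {
  // fshl(0x12, 0x34, 4) == (0x12 << 4) | (0x34 >> (8 - 4)) == 0x23.
  assert(Fshl8(0x12, 0x34, 4) == 0x23);
  // fshr(0x12, 0x34, 4) == (0x12 << (8 - 4)) | (0x34 >> 4) == 0x23.
  assert(Fshr8(0x12, 0x34, 4) == 0x23);
  assert(Fshl8(0x12, 0x34, 0) == 0x12 && Fshr8(0x12, 0x34, 0) == 0x34);
}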
31282
31283static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31284 SelectionDAG &DAG) {
31285 MVT VT = Op.getSimpleValueType();
31286 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31287
31288 SDLoc DL(Op);
31289 SDValue R = Op.getOperand(0);
31290 SDValue Amt = Op.getOperand(1);
31291 unsigned Opcode = Op.getOpcode();
31292 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31293 int NumElts = VT.getVectorNumElements();
31294 bool IsROTL = Opcode == ISD::ROTL;
31295
31296 // Check for constant splat rotation amount.
31297 APInt CstSplatValue;
31298 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31299
31300 // Check for splat rotate by zero.
31301 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31302 return R;
31303
31304 // AVX512 implicitly uses modulo rotation amounts.
31305 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
31306 // Attempt to rotate by immediate.
31307 if (IsCstSplat) {
31308 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31309 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31310 return DAG.getNode(RotOpc, DL, VT, R,
31311 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31312 }
31313
31314 // Else, fall-back on VPROLV/VPRORV.
31315 return Op;
31316 }
31317
31318 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31319 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31320 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31321 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31322 }
31323
31324 SDValue Z = DAG.getConstant(0, DL, VT);
31325
31326 if (!IsROTL) {
31327 // If the ISD::ROTR amount is constant, we're always better converting to
31328 // ISD::ROTL.
31329 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31330 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31331
31332 // XOP targets always prefer ISD::ROTL.
31333 if (Subtarget.hasXOP())
31334 return DAG.getNode(ISD::ROTL, DL, VT, R,
31335 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31336 }
31337
31338 // Split 256-bit integers on XOP/pre-AVX2 targets.
31339 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31340 return splitVectorIntBinary(Op, DAG);
31341
31342 // XOP has 128-bit vector variable + immediate rotates.
31343 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31344 // XOP implicitly uses modulo rotation amounts.
31345 if (Subtarget.hasXOP()) {
31346 assert(IsROTL && "Only ROTL expected");
31347 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31348
31349 // Attempt to rotate by immediate.
31350 if (IsCstSplat) {
31351 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31352 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31353 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31354 }
31355
31356 // Use general rotate by variable (per-element).
31357 return Op;
31358 }
31359
31360 // Rotate by a uniform constant - expand back to shifts.
31361 if (IsCstSplat)
31362 return SDValue();
31363
31364 // Split 512-bit integers on non 512-bit BWI targets.
31365 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31366 return splitVectorIntBinary(Op, DAG);
31367
31368 assert(
31369 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31370 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31371 Subtarget.hasAVX2()) ||
31372 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31373 "Only vXi32/vXi16/vXi8 vector rotates supported");
31374
31375 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31376 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31377
31378 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31379 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31380
31381 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31382 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31383 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31384 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31385 int BaseRotAmtIdx = -1;
31386 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31387 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31388 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31389 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31390 }
31391 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31392 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31393 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31394 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31395 BaseRotAmtIdx, Subtarget, DAG);
31396 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31397 BaseRotAmtIdx, Subtarget, DAG);
31398 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31399 }
31400 }
31401
31402 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31403 // the amount bit.
31404 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31405 if (EltSizeInBits == 8) {
31406 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31407 MVT WideVT =
31408 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31409 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31410
31411 // Attempt to fold as:
31412 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31413 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31414 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31415 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31416 // If we're rotating by constant, just use default promotion.
31417 if (IsConstAmt)
31418 return SDValue();
31419 // See if we can perform this by widening to vXi16 or vXi32.
31420 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31421 R = DAG.getNode(
31422 ISD::OR, DL, WideVT, R,
31423 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31424 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31425 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31426 if (IsROTL)
31427 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31428 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31429 }
31430
31431 // Attempt to fold as unpack(x,x) << zext(y):
31432 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31433 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31434 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31435 // See if we can perform this by unpacking to lo/hi vXi16.
31436 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31437 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31438 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31439 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31440 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31441 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31442 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31443 }
31444 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
31445
31446 // We don't need ModuloAmt here as we just peek at individual bits.
31447 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31448 if (Subtarget.hasSSE41()) {
31449 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31450 // on the sign bit.
31451 V0 = DAG.getBitcast(VT, V0);
31452 V1 = DAG.getBitcast(VT, V1);
31453 Sel = DAG.getBitcast(VT, Sel);
31454 return DAG.getBitcast(SelVT,
31455 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31456 }
31457 // On pre-SSE41 targets we test for the sign bit by comparing to
31458 // zero - a negative value will set all bits of the lanes to true
31459 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31460 SDValue Z = DAG.getConstant(0, DL, SelVT);
31461 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31462 return DAG.getSelect(DL, SelVT, C, V0, V1);
31463 };
31464
31465 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31466 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31467 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31468 IsROTL = true;
31469 }
31470
31471 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31472 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31473
31474 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31475 // We can safely do this using i16 shifts as we're only interested in
31476 // the 3 lower bits of each byte.
31477 Amt = DAG.getBitcast(ExtVT, Amt);
31478 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31479 Amt = DAG.getBitcast(VT, Amt);
31480
31481 // r = VSELECT(r, rot(r, 4), a);
31482 SDValue M;
31483 M = DAG.getNode(
31484 ISD::OR, DL, VT,
31485 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31486 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31487 R = SignBitSelect(VT, Amt, M, R);
31488
31489 // a += a
31490 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31491
31492 // r = VSELECT(r, rot(r, 2), a);
31493 M = DAG.getNode(
31494 ISD::OR, DL, VT,
31495 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31496 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31497 R = SignBitSelect(VT, Amt, M, R);
31498
31499 // a += a
31500 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31501
31502 // return VSELECT(r, rot(r, 1), a);
31503 M = DAG.getNode(
31504 ISD::OR, DL, VT,
31505 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31506 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31507 return SignBitSelect(VT, Amt, M, R);
31508 }
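The staged lowering above is the vector form of the following scalar routine, where each VSELECT on the amount sign bit plays the role of one conditional step. Editor-added sketch under that reading; Rotl8Staged is a hypothetical name.

#include <cstdint>

static inline uint8_t Rotl8Staged(uint8_t R, unsigned Amt) {
  // Rotate by 4, then 2, then 1, each step enabled by one bit of the amount.
  if (Amt & 4) R = uint8_t((R << 4) | (R >> 4));
  if (Amt & 2) R = uint8_t((R << 2) | (R >> 6));
  if (Amt & 1) R = uint8_t((R << 1) | (R >> 7));
  return R;
}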
31509
31510 bool IsSplatAmt = DAG.isSplatValue(Amt);
31511 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31512 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31513 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31514
31515 // Fallback for splats + all supported variable shifts.
31516 // Fallback for non-constant AVX2 vXi16 as well.
31517 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31518 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31519 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31520 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31521 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31522 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31523 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31524 }
31525
31526 // Everything below assumes ISD::ROTL.
31527 if (!IsROTL) {
31528 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31529 IsROTL = true;
31530 }
31531
31532 // ISD::ROT* uses modulo rotate amounts.
31533 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31534
31535 assert(IsROTL && "Only ROTL supported");
31536
31537 // As with shifts, attempt to convert the rotation amount to a multiplication
31538 // factor, fallback to general expansion.
31539 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31540 if (!Scale)
31541 return SDValue();
31542
31543 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31544 if (EltSizeInBits == 16) {
31545 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31546 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31547 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31548 }
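For reference, the multiply-based fold used here can be modelled on a single i16 lane as below. Editor-added sketch; Rotl16ViaMul is a hypothetical name, and Scale stands in for the multiplier produced by convertShiftLeftToScale.

#include <cstdint>

static inline uint16_t Rotl16ViaMul(uint16_t X, unsigned Amt) {
  uint32_t Scale = uint32_t(1) << (Amt & 15);   // rotate amount as a multiplier
  uint32_t Prod = uint32_t(X) * Scale;          // full 32-bit product
  return uint16_t(Prod) | uint16_t(Prod >> 16); // MUL (low) OR'd with MULHU (high)
}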
31549
31550 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31551 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31552 // that can then be OR'd with the lower 32-bits.
31553 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31554 static const int OddMask[] = {1, -1, 3, -1};
31555 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31556 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31557
31558 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31559 DAG.getBitcast(MVT::v2i64, R),
31560 DAG.getBitcast(MVT::v2i64, Scale));
31561 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31562 DAG.getBitcast(MVT::v2i64, R13),
31563 DAG.getBitcast(MVT::v2i64, Scale13));
31564 Res02 = DAG.getBitcast(VT, Res02);
31565 Res13 = DAG.getBitcast(VT, Res13);
31566
31567 return DAG.getNode(ISD::OR, DL, VT,
31568 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31569 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31570}
31571
31572/// Returns true if the operand type is exactly twice the native width, and
31573/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31574/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31575/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31576bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31577 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31578
31579 if (OpWidth == 64)
31580 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31581 if (OpWidth == 128)
31582 return Subtarget.canUseCMPXCHG16B();
31583
31584 return false;
31585}
31586
31587TargetLoweringBase::AtomicExpansionKind
31588X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31589 Type *MemType = SI->getValueOperand()->getType();
31590
31591 bool NoImplicitFloatOps =
31592 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
31593 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31594 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
31595 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31596 return AtomicExpansionKind::None;
31597
31598 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31599 : AtomicExpansionKind::None;
31600}
31601
31602// Note: this turns large loads into lock cmpxchg8b/16b.
31603// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
31604TargetLowering::AtomicExpansionKind
31605X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31606 Type *MemType = LI->getType();
31607
31608 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31609 // can use movq to do the load. If we have X87 we can load into an 80-bit
31610 // X87 register and store it to a stack temporary.
31611 bool NoImplicitFloatOps =
31612 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
31613 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31614 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
31615 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31616 return AtomicExpansionKind::None;
31617
31618 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31619 : AtomicExpansionKind::None;
31620}
31621
31622enum BitTestKind : unsigned {
31623 UndefBit,
31624 ConstantBit,
31625 NotConstantBit,
31626 ShiftBit,
31627 NotShiftBit
31628};
31629
31630static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31631 using namespace llvm::PatternMatch;
31632 BitTestKind BTK = UndefBit;
31633 auto *C = dyn_cast<ConstantInt>(V);
31634 if (C) {
31635 // Check if V is a power of 2 or NOT power of 2.
31636 if (isPowerOf2_64(C->getZExtValue()))
31637 BTK = ConstantBit;
31638 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31639 BTK = NotConstantBit;
31640 return {V, BTK};
31641 }
31642
31643 // Check if V is some power of 2 pattern known to be non-zero
31644 auto *I = dyn_cast<Instruction>(V);
31645 if (I) {
31646 bool Not = false;
31647 // Check if we have a NOT
31648 Value *PeekI;
31649 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
31650 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31651 Not = true;
31652 I = dyn_cast<Instruction>(PeekI);
31653
31654 // If I is constant, it will fold and we can evaluate later. If it's an
31655 // argument or something of that nature, we can't analyze it.
31656 if (I == nullptr)
31657 return {nullptr, UndefBit};
31658 }
31659 // We can only use 1 << X without more sophisticated analysis. C << X where
31660 // C is a power of 2 but not 1 can result in zero which cannot be translated
31661 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31662 if (I->getOpcode() == Instruction::Shl) {
31663 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31664 // -X` and some other provable power of 2 patterns that we can use CTZ on
31665 // may be profitable.
31666 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31667 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31668 // be provably a non-zero power of 2.
31669 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31670 // transformable to bittest.
31671 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31672 if (!ShiftVal)
31673 return {nullptr, UndefBit};
31674 if (ShiftVal->equalsInt(1))
31675 BTK = Not ? NotShiftBit : ShiftBit;
31676
31677 if (BTK == UndefBit)
31678 return {nullptr, UndefBit};
31679
31680 Value *BitV = I->getOperand(1);
31681
31682 Value *AndOp;
31683 const APInt *AndC;
31684 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
31685 // Read past a shiftmask instruction to find the count.
31686 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
31687 BitV = AndOp;
31688 }
31689 return {BitV, BTK};
31690 }
31691 }
31692 return {nullptr, UndefBit};
31693}
31694
31695TargetLowering::AtomicExpansionKind
31696X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31697 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31698 // prefix to a normal instruction for these operations.
31699 if (AI->use_empty())
31700 return AtomicExpansionKind::None;
31701
31702 // If the atomicrmw's result is used by a single bit AND, we may use
31703 // a bts/btr/btc instruction for these operations.
31704 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31705 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31706 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31707 // detect it.
31708 Instruction *I = AI->user_back();
31709 auto BitChange = FindSingleBitChange(AI->getValOperand());
31710 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31711 I->getOpcode() != Instruction::And ||
31712 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31713 AI->getParent() != I->getParent())
31714 return AtomicExpansionKind::CmpXChg;
31715
31716 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31717
31718 // This is a redundant AND, it should get cleaned up elsewhere.
31719 if (AI == I->getOperand(OtherIdx))
31720 return AtomicExpansionKind::CmpXChg;
31721
31722 // The following instruction must be an AND with a single bit.
31723 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31724 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31725 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31726 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31727 return AtomicExpansionKind::CmpXChg;
31728 }
31729 if (AI->getOperation() == AtomicRMWInst::And) {
31730 return ~C1->getValue() == C2->getValue()
31731 ? AtomicExpansionKind::BitTestIntrinsic
31732 : AtomicExpansionKind::CmpXChg;
31733 }
31734 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31735 : AtomicExpansionKind::CmpXChg;
31736 }
31737
31738 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31739
31740 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31741 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31742 return AtomicExpansionKind::CmpXChg;
31743
31744 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31745
31746 // If shift amounts are not the same we can't use BitTestIntrinsic.
31747 if (BitChange.first != BitTested.first)
31748 return AtomicExpansionKind::CmpXChg;
31749
31750 // For atomic AND, the mask must keep all bits but one, and the AND must
31751 // test the one bit that is unset in the mask.
31752 if (AI->getOperation() == AtomicRMWInst::And)
31753 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31754 ? AtomicExpansionKind::BitTestIntrinsic
31755 : AtomicExpansionKind::CmpXChg;
31756
31757 // For atomic XOR/OR, we must be setting and testing the same bit.
31758 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31759 ? AtomicExpansionKind::BitTestIntrinsic
31760 : AtomicExpansionKind::CmpXChg;
31761}
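A user-level shape that satisfies the checks above looks roughly like the following. This is an editor-added illustration in terms of std::atomic, not taken from the LLVM sources; whether it actually selects a lock bts depends on the IR the front end produces.

#include <atomic>

// The rmw result is only used to test the single bit that was set, which is
// the ShiftBit/ShiftBit case accepted above.
bool SetAndTestBit(std::atomic<unsigned> &Flags, unsigned Bit) {
  unsigned Mask = 1u << Bit;                 // matches the 1 << X pattern
  return (Flags.fetch_or(Mask) & Mask) != 0; // candidate for lock bts + setcc
}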
31762
31763void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31764 IRBuilder<> Builder(AI);
31765 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
31766 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
31767 switch (AI->getOperation()) {
31768 default:
31769 llvm_unreachable("Unknown atomic operation");
31770 case AtomicRMWInst::Or:
31771 IID_C = Intrinsic::x86_atomic_bts;
31772 IID_I = Intrinsic::x86_atomic_bts_rm;
31773 break;
31774 case AtomicRMWInst::Xor:
31775 IID_C = Intrinsic::x86_atomic_btc;
31776 IID_I = Intrinsic::x86_atomic_btc_rm;
31777 break;
31778 case AtomicRMWInst::And:
31779 IID_C = Intrinsic::x86_atomic_btr;
31780 IID_I = Intrinsic::x86_atomic_btr_rm;
31781 break;
31782 }
31783 Instruction *I = AI->user_back();
31784 LLVMContext &Ctx = AI->getContext();
31785 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31786 Type::getInt8PtrTy(Ctx));
31787 Function *BitTest = nullptr;
31788 Value *Result = nullptr;
31789 auto BitTested = FindSingleBitChange(AI->getValOperand());
31790 assert(BitTested.first != nullptr);
31791
31792 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31793 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31794
31795 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
31796
31797 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31798 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
31799 } else {
31800 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
31801
31802 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31803
31804 Value *SI = BitTested.first;
31805 assert(SI != nullptr);
31806
31807 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
31808 // need to mask it.
31809 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31810 Value *BitPos =
31811 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31812 // Todo(1): In many cases it may be provable that SI is less than
31813 // ShiftBits in which case this mask is unnecessary
31814 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31815 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31816 // favor of just a raw BT{S|R|C}.
31817
31818 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
31819 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31820
31821 // If the result is only used for zero/non-zero status then we don't need to
31822 // shift the value back. Otherwise do so.
31823 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31824 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31825 if (ICmp->isEquality()) {
31826 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31827 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31828 if (C0 || C1) {
31829 assert(C0 == nullptr || C1 == nullptr);
31830 if ((C0 ? C0 : C1)->isZero())
31831 continue;
31832 }
31833 }
31834 }
31835 Result = Builder.CreateShl(Result, BitPos);
31836 break;
31837 }
31838 }
31839
31840 I->replaceAllUsesWith(Result);
31841 I->eraseFromParent();
31842 AI->eraseFromParent();
31843}
31844
31845static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31846 using namespace llvm::PatternMatch;
31847 if (!AI->hasOneUse())
31848 return false;
31849
31850 Value *Op = AI->getOperand(1);
31851 ICmpInst::Predicate Pred;
31852 Instruction *I = AI->user_back();
31853 AtomicRMWInst::BinOp Opc = AI->getOperation();
31854 if (Opc == AtomicRMWInst::Add) {
31855 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31856 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31857 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31858 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31859 return Pred == CmpInst::ICMP_SLT;
31860 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31861 return Pred == CmpInst::ICMP_SGT;
31862 }
31863 return false;
31864 }
31865 if (Opc == AtomicRMWInst::Sub) {
31866 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31867 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31868 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
31869 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31870 return Pred == CmpInst::ICMP_SLT;
31871 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31872 return Pred == CmpInst::ICMP_SGT;
31873 }
31874 return false;
31875 }
31876 if ((Opc == AtomicRMWInst::Or &&
31877 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
31878 (Opc == AtomicRMWInst::And &&
31879 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
31880 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31881 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
31882 Pred == CmpInst::ICMP_SLT;
31883 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31884 return Pred == CmpInst::ICMP_SGT;
31885 return false;
31886 }
31887 if (Opc == AtomicRMWInst::Xor) {
31888 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31889 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31890 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
31891 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31892 return Pred == CmpInst::ICMP_SLT;
31893 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31894 return Pred == CmpInst::ICMP_SGT;
31895 }
31896 return false;
31897 }
31898
31899 return false;
31900}
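A common source pattern that reaches the Sub case above is a reference-count release, sketched below. Editor-added illustration using std::atomic; ReleaseRef is a hypothetical name.

#include <atomic>

bool ReleaseRef(std::atomic<int> &RefCount) {
  // The only use of the atomicrmw result is an equality compare against the
  // subtracted value, so the flags of the locked sub can answer it directly.
  return RefCount.fetch_sub(1) == 1; // old == 1, i.e. the new value is zero
}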
31901
31902void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
31903 AtomicRMWInst *AI) const {
31904 IRBuilder<> Builder(AI);
31905 Instruction *TempI = nullptr;
31906 LLVMContext &Ctx = AI->getContext();
31907 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31908 if (!ICI) {
31909 TempI = AI->user_back();
31910 assert(TempI->hasOneUse() && "Must have one use");
31911 ICI = cast<ICmpInst>(TempI->user_back());
31912 }
31913 X86::CondCode CC = X86::COND_INVALID;
31914 ICmpInst::Predicate Pred = ICI->getPredicate();
31915 switch (Pred) {
31916 default:
31917 llvm_unreachable("Not supported Pred");
31918 case CmpInst::ICMP_EQ:
31919 CC = X86::COND_E;
31920 break;
31921 case CmpInst::ICMP_NE:
31922 CC = X86::COND_NE;
31923 break;
31924 case CmpInst::ICMP_SLT:
31925 CC = X86::COND_S;
31926 break;
31927 case CmpInst::ICMP_SGT:
31928 CC = X86::COND_NS;
31929 break;
31930 }
31931 Intrinsic::ID IID = Intrinsic::not_intrinsic;
31932 switch (AI->getOperation()) {
31933 default:
31934 llvm_unreachable("Unknown atomic operation");
31935 case AtomicRMWInst::Add:
31936 IID = Intrinsic::x86_atomic_add_cc;
31937 break;
31938 case AtomicRMWInst::Sub:
31939 IID = Intrinsic::x86_atomic_sub_cc;
31940 break;
31941 case AtomicRMWInst::Or:
31942 IID = Intrinsic::x86_atomic_or_cc;
31943 break;
31944 case AtomicRMWInst::And:
31945 IID = Intrinsic::x86_atomic_and_cc;
31946 break;
31947 case AtomicRMWInst::Xor:
31948 IID = Intrinsic::x86_atomic_xor_cc;
31949 break;
31950 }
31951 Function *CmpArith =
31952 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
31953 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31954 Type::getInt8PtrTy(Ctx));
31955 Value *Call = Builder.CreateCall(
31956 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31957 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
31958 ICI->replaceAllUsesWith(Result);
31959 ICI->eraseFromParent();
31960 if (TempI)
31961 TempI->eraseFromParent();
31962 AI->eraseFromParent();
31963}
31964
31965TargetLowering::AtomicExpansionKind
31966X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
31967 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31968 Type *MemType = AI->getType();
31969
31970 // If the operand is too big, we must see if cmpxchg8/16b is available
31971 // and default to library calls otherwise.
31972 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31973 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31974 : AtomicExpansionKind::None;
31975 }
31976
31977 AtomicRMWInst::BinOp Op = AI->getOperation();
31978 switch (Op) {
31979 case AtomicRMWInst::Xchg:
31980 return AtomicExpansionKind::None;
31981 case AtomicRMWInst::Add:
31982 case AtomicRMWInst::Sub:
31983 if (shouldExpandCmpArithRMWInIR(AI))
31984 return AtomicExpansionKind::CmpArithIntrinsic;
31985 // It's better to use xadd, xsub or xchg for these in other cases.
31986 return AtomicExpansionKind::None;
31987 case AtomicRMWInst::Or:
31988 case AtomicRMWInst::And:
31989 case AtomicRMWInst::Xor:
31990 if (shouldExpandCmpArithRMWInIR(AI))
31991 return AtomicExpansionKind::CmpArithIntrinsic;
31992 return shouldExpandLogicAtomicRMWInIR(AI);
31993 case AtomicRMWInst::Nand:
31994 case AtomicRMWInst::Max:
31995 case AtomicRMWInst::Min:
31996 case AtomicRMWInst::UMax:
31997 case AtomicRMWInst::UMin:
31998 case AtomicRMWInst::FAdd:
31999 case AtomicRMWInst::FSub:
32000 case AtomicRMWInst::FMax:
32001 case AtomicRMWInst::FMin:
32002 case AtomicRMWInst::UIncWrap:
32003 case AtomicRMWInst::UDecWrap:
32004 default:
32005 // These always require a non-trivial set of data operations on x86. We must
32006 // use a cmpxchg loop.
32007 return AtomicExpansionKind::CmpXChg;
32008 }
32009}
32010
32011LoadInst *
32012X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32013 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32014 Type *MemType = AI->getType();
32015 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32016 // there is no benefit in turning such RMWs into loads, and it is actually
32017 // harmful as it introduces a mfence.
32018 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32019 return nullptr;
32020
32021 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32022 // lowering available in lowerAtomicArith.
32023 // TODO: push more cases through this path.
32024 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32025 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32026 AI->use_empty())
32027 return nullptr;
32028
32029 IRBuilder<> Builder(AI);
32030 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32031 auto SSID = AI->getSyncScopeID();
32032 // We must restrict the ordering to avoid generating loads with Release or
32033 // ReleaseAcquire orderings.
32034 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32035
32036 // Before the load we need a fence. Here is an example lifted from
32037 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32038 // is required:
32039 // Thread 0:
32040 // x.store(1, relaxed);
32041 // r1 = y.fetch_add(0, release);
32042 // Thread 1:
32043 // y.fetch_add(42, acquire);
32044 // r2 = x.load(relaxed);
32045 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32046 // lowered to just a load without a fence. A mfence flushes the store buffer,
32047 // making the optimization clearly correct.
32048 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32049 // otherwise, we might be able to be more aggressive on relaxed idempotent
32050 // rmw. In practice, they do not look useful, so we don't try to be
32051 // especially clever.
32052 if (SSID == SyncScope::SingleThread)
32053 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32054 // the IR level, so we must wrap it in an intrinsic.
32055 return nullptr;
32056
32057 if (!Subtarget.hasMFence())
32058 // FIXME: it might make sense to use a locked operation here but on a
32059 // different cache-line to prevent cache-line bouncing. In practice it
32060 // is probably a small win, and x86 processors without mfence are rare
32061 // enough that we do not bother.
32062 return nullptr;
32063
32064 Function *MFence =
32065 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32066 Builder.CreateCall(MFence, {});
32067
32068 // Finally we can emit the atomic load.
32069 LoadInst *Loaded = Builder.CreateAlignedLoad(
32070 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32071 Loaded->setAtomic(Order, SSID);
32072 AI->replaceAllUsesWith(Loaded);
32073 AI->eraseFromParent();
32074 return Loaded;
32075}
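Conceptually, the replacement built here corresponds to the following user-level sequence for a natively sized idempotent RMW. Editor-added sketch; the real transform works on IR and picks the ordering via getStrongestFailureOrdering rather than hard-coding seq_cst.

#include <atomic>
#include <emmintrin.h>

int IdempotentRMWAsFencedLoad(std::atomic<int> &X) {
  _mm_mfence();                             // flush the store buffer first
  return X.load(std::memory_order_seq_cst); // then a plain atomic load
}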
32076
32077bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32078 if (!SI.isUnordered())
32079 return false;
32080 return ExperimentalUnorderedISEL;
32081}
32082bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32083 if (!LI.isUnordered())
32084 return false;
32085 return ExperimentalUnorderedISEL;
32086}
32087
32088
32089/// Emit a locked operation on a stack location which does not change any
32090/// memory location, but does involve a lock prefix. Location is chosen to be
32091/// a) very likely accessed only by a single thread to minimize cache traffic,
32092/// and b) definitely dereferenceable. Returns the new Chain result.
32093static SDValue emitLockedStackOp(SelectionDAG &DAG,
32094 const X86Subtarget &Subtarget, SDValue Chain,
32095 const SDLoc &DL) {
32096 // Implementation notes:
32097 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32098 // operations issued by the current processor. As such, the location
32099 // referenced is not relevant for the ordering properties of the instruction.
32100 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32101 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32102 // 2) Using an immediate operand appears to be the best encoding choice
32103 // here since it doesn't require an extra register.
32104 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32105 // is small enough it might just be measurement noise.)
32106 // 4) When choosing offsets, there are several contributing factors:
32107 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32108 // line aligned stack object to improve this case.)
32109 // b) To minimize our chances of introducing a false dependence, we prefer
32110 // to offset the stack usage from TOS slightly.
32111 // c) To minimize concerns about cross thread stack usage - in particular,
32112 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32113 // captures state in the TOS frame and accesses it from many threads -
32114 // we want to use an offset such that the offset is in a distinct cache
32115 // line from the TOS frame.
32116 //
32117 // For a general discussion of the tradeoffs and benchmark results, see:
32118 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32119
32120 auto &MF = DAG.getMachineFunction();
32121 auto &TFL = *Subtarget.getFrameLowering();
32122 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32123
32124 if (Subtarget.is64Bit()) {
32125 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32126 SDValue Ops[] = {
32127 DAG.getRegister(X86::RSP, MVT::i64), // Base
32128 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32129 DAG.getRegister(0, MVT::i64), // Index
32130 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32131 DAG.getRegister(0, MVT::i16), // Segment.
32132 Zero,
32133 Chain};
32134 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32135 MVT::Other, Ops);
32136 return SDValue(Res, 1);
32137 }
32138
32139 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32140 SDValue Ops[] = {
32141 DAG.getRegister(X86::ESP, MVT::i32), // Base
32142 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32143 DAG.getRegister(0, MVT::i32), // Index
32144 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32145 DAG.getRegister(0, MVT::i16), // Segment.
32146 Zero,
32147 Chain
32148 };
32149 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32150 MVT::Other, Ops);
32151 return SDValue(Res, 1);
32152}
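On a 64-bit target with a red zone, the node built above corresponds to roughly the following instruction. Editor-added sketch using GNU inline asm; the real code emits an OR32mi8Locked MachineSDNode rather than textual assembly.

static inline void LockedStackFence() {
  // lock or of immediate 0 against a slot 64 bytes below the stack pointer.
  asm volatile("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
}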
32153
32154static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32155 SelectionDAG &DAG) {
32156 SDLoc dl(Op);
32157 AtomicOrdering FenceOrdering =
32158 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32159 SyncScope::ID FenceSSID =
32160 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32161
32162 // The only fence that needs an instruction is a sequentially-consistent
32163 // cross-thread fence.
32164 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32165 FenceSSID == SyncScope::System) {
32166 if (Subtarget.hasMFence())
32167 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32168
32169 SDValue Chain = Op.getOperand(0);
32170 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32171 }
32172
32173 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32174 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32175}
32176
32177static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32178 SelectionDAG &DAG) {
32179 MVT T = Op.getSimpleValueType();
32180 SDLoc DL(Op);
32181 unsigned Reg = 0;
32182 unsigned size = 0;
32183 switch(T.SimpleTy) {
32184 default: llvm_unreachable("Invalid value type!");
32185 case MVT::i8: Reg = X86::AL; size = 1; break;
32186 case MVT::i16: Reg = X86::AX; size = 2; break;
32187 case MVT::i32: Reg = X86::EAX; size = 4; break;
32188 case MVT::i64:
32189 assert(Subtarget.is64Bit() && "Node not type legal!");
32190 Reg = X86::RAX; size = 8;
32191 break;
32192 }
32193 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32194 Op.getOperand(2), SDValue());
32195 SDValue Ops[] = { cpIn.getValue(0),
32196 Op.getOperand(1),
32197 Op.getOperand(3),
32198 DAG.getTargetConstant(size, DL, MVT::i8),
32199 cpIn.getValue(1) };
32200 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32201 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32202 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32203 Ops, T, MMO);
32204
32205 SDValue cpOut =
32206 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32207 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32208 MVT::i32, cpOut.getValue(2));
32209 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32210
32211 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32212 cpOut, Success, EFLAGS.getValue(1));
32213}
32214
32215// Create MOVMSKB, taking into account whether we need to split for AVX1.
32216static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32217 const X86Subtarget &Subtarget) {
32218 MVT InVT = V.getSimpleValueType();
32219
32220 if (InVT == MVT::v64i8) {
32221 SDValue Lo, Hi;
32222 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32223 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32224 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32225 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32226 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32227 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32228 DAG.getConstant(32, DL, MVT::i8));
32229 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32230 }
32231 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32232 SDValue Lo, Hi;
32233 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32234 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32235 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32236 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32237 DAG.getConstant(16, DL, MVT::i8));
32238 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32239 }
32240
32241 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32242}
32243
32244static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32245 SelectionDAG &DAG) {
32246 SDValue Src = Op.getOperand(0);
32247 MVT SrcVT = Src.getSimpleValueType();
32248 MVT DstVT = Op.getSimpleValueType();
32249
32250 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32251 // half to v32i1 and concatenating the result.
32252 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32253 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32254 assert(Subtarget.hasBWI() && "Expected BWI target");
32255 SDLoc dl(Op);
32256 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
32257 DAG.getIntPtrConstant(0, dl));
32258 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32259 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
32260 DAG.getIntPtrConstant(1, dl));
32261 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32262 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32263 }
32264
32265 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32266 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32267 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32268 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32269 SDLoc DL(Op);
32270 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32271 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32272 return DAG.getZExtOrTrunc(V, DL, DstVT);
32273 }
32274
32275 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32276 SrcVT == MVT::i64) && "Unexpected VT!");
32277
32278 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32279 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32280 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32281 // This conversion needs to be expanded.
32282 return SDValue();
32283
32284 SDLoc dl(Op);
32285 if (SrcVT.isVector()) {
32286 // Widen the input vector in the case of MVT::v2i32.
32287 // Example: from MVT::v2i32 to MVT::v4i32.
32288 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32289 SrcVT.getVectorNumElements() * 2);
32290 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32291 DAG.getUNDEF(SrcVT));
32292 } else {
32293 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32294 "Unexpected source type in LowerBITCAST");
32295 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32296 }
32297
32298 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32299 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32300
32301 if (DstVT == MVT::x86mmx)
32302 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32303
32304 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32305 DAG.getIntPtrConstant(0, dl));
32306}
32307
32308/// Compute the horizontal sum of bytes in V for the elements of VT.
32309///
32310/// Requires V to be a byte vector and VT to be an integer vector type with
32311/// wider elements than V's type. The width of the elements of VT determines
32312/// how many bytes of V are summed horizontally to produce each element of the
32313/// result.
32314static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32315 const X86Subtarget &Subtarget,
32316 SelectionDAG &DAG) {
32317 SDLoc DL(V);
32318 MVT ByteVecVT = V.getSimpleValueType();
32319 MVT EltVT = VT.getVectorElementType();
32320 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32321 "Expected value to have byte element type.");
32322 assert(EltVT != MVT::i8 &&
32323 "Horizontal byte sum only makes sense for wider elements!");
32324 unsigned VecSize = VT.getSizeInBits();
32325 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32326
32327 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32328 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32329 if (EltVT == MVT::i64) {
32330 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32331 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32332 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32333 return DAG.getBitcast(VT, V);
32334 }
32335
32336 if (EltVT == MVT::i32) {
32337 // We unpack the low half and high half into i32s interleaved with zeros so
32338 // that we can use PSADBW to horizontally sum them. The most useful part of
32339 // this is that it lines up the results of two PSADBW instructions to be
32340 // two v2i64 vectors which concatenated are the 4 population counts. We can
32341 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32342 SDValue Zeros = DAG.getConstant(0, DL, VT);
32343 SDValue V32 = DAG.getBitcast(VT, V);
32344 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32345 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32346
32347 // Do the horizontal sums into two v2i64s.
32348 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32349 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32350 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32351 DAG.getBitcast(ByteVecVT, Low), Zeros);
32352 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32353 DAG.getBitcast(ByteVecVT, High), Zeros);
32354
32355 // Merge them together.
32356 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32357 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32358 DAG.getBitcast(ShortVecVT, Low),
32359 DAG.getBitcast(ShortVecVT, High));
32360
32361 return DAG.getBitcast(VT, V);
32362 }
32363
32364 // The only element type left is i16.
32365 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32366
32367 // To obtain pop count for each i16 element starting from the pop count for
32368 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32369 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32370 // directly supported.
32371 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32372 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32373 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32374 DAG.getBitcast(ByteVecVT, V));
32375 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32376}
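The i16 tail case above can be checked against this scalar model, where the two byte counts in a 16-bit lane are summed via the same shift, add, shift sequence. Editor-added sketch; ByteSumToI16 is a hypothetical name.

#include <cstdint>

static inline uint16_t ByteSumToI16(uint16_t PerBytePopCnt) {
  uint16_t Shl = uint16_t(PerBytePopCnt << 8);  // move the low byte count up
  // The high byte now holds lo + hi; the low byte cannot carry into it, since
  // each per-byte count is at most 8.
  uint16_t Sum = uint16_t(Shl + PerBytePopCnt);
  return uint16_t(Sum >> 8);                    // shift the result back down
}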
32377
32378static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32379 const X86Subtarget &Subtarget,
32380 SelectionDAG &DAG) {
32381 MVT VT = Op.getSimpleValueType();
32382 MVT EltVT = VT.getVectorElementType();
32383 int NumElts = VT.getVectorNumElements();
32384 (void)EltVT;
32385 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32386
32387 // Implement a lookup table in register by using an algorithm based on:
32388 // http://wm.ite.pl/articles/sse-popcount.html
32389 //
32390 // The general idea is that every lower byte nibble in the input vector is an
32391 // index into an in-register pre-computed pop count table. We then split up the
32392 // input vector into two new ones: (1) a vector with only the shifted-right
32393 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32394 // masked out higher ones) for each byte. PSHUFB is used separately with both
32395 // to index the in-register table. Next, both are added and the result is an
32396 // i8 vector where each element contains the pop count for the input byte.
32397 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32398 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32399 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32400 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32401
32402 SmallVector<SDValue, 64> LUTVec;
32403 for (int i = 0; i < NumElts; ++i)
32404 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32405 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32406 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32407
32408 // High nibbles
32409 SDValue FourV = DAG.getConstant(4, DL, VT);
32410 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32411
32412 // Low nibbles
32413 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32414
32415 // The input vector is used as the shuffle mask that index elements into the
32416 // LUT. After counting low and high nibbles, add the vector to obtain the
32417 // final pop count per i8 element.
32418 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32419 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32420 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32421}
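The same nibble lookup on a single byte, for reference; PSHUFB performs this table lookup for every byte of the vector at once. Editor-added sketch.

#include <cstdint>

static inline unsigned PopCount8ViaLUT(uint8_t X) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return LUT[X >> 4] + LUT[X & 0x0F]; // high-nibble count + low-nibble count
}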
32422
32423// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32424// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32425static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32426 SelectionDAG &DAG) {
32427 MVT VT = Op.getSimpleValueType();
32428 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32429 "Unknown CTPOP type to handle");
32430 SDLoc DL(Op.getNode());
32431 SDValue Op0 = Op.getOperand(0);
32432
32433 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32434 if (Subtarget.hasVPOPCNTDQ()) {
32435 unsigned NumElems = VT.getVectorNumElements();
32436 assert((VT.getVectorElementType() == MVT::i8 ||
32437 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32438 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32439 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32440 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32441 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32442 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32443 }
32444 }
32445
32446 // Decompose 256-bit ops into smaller 128-bit ops.
32447 if (VT.is256BitVector() && !Subtarget.hasInt256())
32448 return splitVectorIntUnary(Op, DAG);
32449
32450 // Decompose 512-bit ops into smaller 256-bit ops.
32451 if (VT.is512BitVector() && !Subtarget.hasBWI())
32452 return splitVectorIntUnary(Op, DAG);
32453
32454 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32455 if (VT.getScalarType() != MVT::i8) {
32456 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32457 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32458 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32459 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32460 }
32461
32462 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32463 if (!Subtarget.hasSSSE3())
32464 return SDValue();
32465
32466 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32467}
32468
32469static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32470 SelectionDAG &DAG) {
32471 assert(Op.getSimpleValueType().isVector() &&
32472 "We only do custom lowering for vector population count.");
32473 return LowerVectorCTPOP(Op, Subtarget, DAG);
32474}
32475
32476static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32477 MVT VT = Op.getSimpleValueType();
32478 SDValue In = Op.getOperand(0);
32479 SDLoc DL(Op);
32480
32481 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32482 // perform the BITREVERSE.
32483 if (!VT.isVector()) {
32484 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32485 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32486 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32488 DAG.getIntPtrConstant(0, DL));
32489 }
32490
32491 int NumElts = VT.getVectorNumElements();
32492 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32493
32494 // Decompose 256-bit ops into smaller 128-bit ops.
32495 if (VT.is256BitVector())
32496 return splitVectorIntUnary(Op, DAG);
32497
32498 assert(VT.is128BitVector() &&
32499 "Only 128-bit vector bitreverse lowering supported.");
32500
32501 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32502 // perform the BSWAP in the shuffle.
32503 // It's best to shuffle using the second operand as this will implicitly allow
32504 // memory folding for multiple vectors.
32505 SmallVector<SDValue, 16> MaskElts;
32506 for (int i = 0; i != NumElts; ++i) {
32507 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32508 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32509 int PermuteByte = SourceByte | (2 << 5);
32510 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32511 }
32512 }
32513
32514 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32515 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32516 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32517 Res, Mask);
32518 return DAG.getBitcast(VT, Res);
32519}
32520
32521static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32522 SelectionDAG &DAG) {
32523 MVT VT = Op.getSimpleValueType();
32524
32525 if (Subtarget.hasXOP() && !VT.is512BitVector())
32526 return LowerBITREVERSE_XOP(Op, DAG);
32527
32528 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
32529
32530 SDValue In = Op.getOperand(0);
32531 SDLoc DL(Op);
32532
32533 assert(VT.getScalarType() == MVT::i8 &&
32534 "Only byte vector BITREVERSE supported");
32535
32536 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
32537 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
32538 return splitVectorIntUnary(Op, DAG);
32539
32540 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32541 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
32542 return splitVectorIntUnary(Op, DAG);
32543
32544 unsigned NumElts = VT.getVectorNumElements();
32545
32546 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32547 if (Subtarget.hasGFNI()) {
32548 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
32549 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
32550 Matrix = DAG.getBitcast(VT, Matrix);
32551 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32552 DAG.getTargetConstant(0, DL, MVT::i8));
32553 }
32554
32555 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32556 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32557 // 0-15 value (moved to the other nibble).
32558 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32559 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32560 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32561
32562 const int LoLUT[16] = {
32563 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32564 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32565 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32566 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32567 const int HiLUT[16] = {
32568 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32569 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32570 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32571 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32572
32573 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32574 for (unsigned i = 0; i < NumElts; ++i) {
32575 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32576 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32577 }
32578
32579 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32580 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32581 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32582 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32583 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32584}
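// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp; the
// helper name is hypothetical.] A scalar model of the PSHUFB lowering above,
// using the same two lookup tables: the low nibble indexes LoLUT (its
// bit-reversed value lands in the high nibble) and the high nibble indexes
// HiLUT (its bit-reversed value lands in the low nibble); OR-ing the two
// halves bit-reverses the byte.
static unsigned char reverseByteViaLUTs(unsigned char B) {
  static const unsigned char LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0,
                                          0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
                                          0x30, 0xB0, 0x70, 0xF0};
  static const unsigned char HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A,
                                          0x06, 0x0E, 0x01, 0x09, 0x05, 0x0D,
                                          0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[B & 0xF] | HiLUT[B >> 4];
}
// Example: 0xB4 = 0b10110100 -> LoLUT[0x4] | HiLUT[0xB] = 0x20 | 0x0D = 0x2D
// = 0b00101101, i.e. the input read back to front. The GFNI path above reaches
// the same result with a single GF2P8AFFINEQB against the 0x8040201008040201
// bit-reversal matrix.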
32585
32586static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32587 SelectionDAG &DAG) {
32588 SDLoc DL(Op);
32589 SDValue X = Op.getOperand(0);
32590 MVT VT = Op.getSimpleValueType();
32591
32592 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32593 if (VT == MVT::i8 ||
32594 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32595 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32596 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32597 DAG.getConstant(0, DL, MVT::i8));
32598 // Copy the inverse of the parity flag into a register with setcc.
32599 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32600 // Extend to the original type.
32601 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32602 }
32603
32604 // If we have POPCNT, use the default expansion.
32605 if (Subtarget.hasPOPCNT())
32606 return SDValue();
32607
32608 if (VT == MVT::i64) {
32609 // Xor the high and low 32-bits together using a 32-bit operation.
32610 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32611 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32612 DAG.getConstant(32, DL, MVT::i8)));
32613 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32614 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32615 }
32616
32617 if (VT != MVT::i16) {
32618 // Xor the high and low 16-bits together using a 32-bit operation.
32619 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32620 DAG.getConstant(16, DL, MVT::i8));
32621 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32622 } else {
32623 // If the input is 16-bits, we need to extend to use an i32 shift below.
32624 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32625 }
32626
32627 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32628 // This should allow an h-reg to be used to save a shift.
32629 SDValue Hi = DAG.getNode(
32630 ISD::TRUNCATE, DL, MVT::i8,
32631 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32632 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32633 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32634 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32635
32636 // Copy the inverse of the parity flag into a register with setcc.
32637 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32638 // Extend to the original type.
32639 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32640}
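// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp; the
// helper name is hypothetical.] Scalar model of the no-POPCNT path above: the
// parity of a value equals the parity of the XOR of its two halves, so the
// value is XOR-folded down to a single byte. The lowering stops at the byte
// and reads the inverted PF flag of an 8-bit XOR (SETNP), which is 1 exactly
// when that byte has an odd number of set bits; the fold below simply finishes
// the reduction in software.
static unsigned parity32(unsigned X) {
  X ^= X >> 16; // fold the two 16-bit halves
  X ^= X >> 8;  // fold down to one byte (hardware uses PF from here on)
  X ^= X >> 4;
  X ^= X >> 2;
  X ^= X >> 1;
  return X & 1; // 1 if the original value had an odd number of set bits
}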
32641
32642static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32643 const X86Subtarget &Subtarget) {
32644 unsigned NewOpc = 0;
32645 switch (N->getOpcode()) {
32646 case ISD::ATOMIC_LOAD_ADD:
32647 NewOpc = X86ISD::LADD;
32648 break;
32649 case ISD::ATOMIC_LOAD_SUB:
32650 NewOpc = X86ISD::LSUB;
32651 break;
32652 case ISD::ATOMIC_LOAD_OR:
32653 NewOpc = X86ISD::LOR;
32654 break;
32655 case ISD::ATOMIC_LOAD_XOR:
32656 NewOpc = X86ISD::LXOR;
32657 break;
32658 case ISD::ATOMIC_LOAD_AND:
32659 NewOpc = X86ISD::LAND;
32660 break;
32661 default:
32662     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32663 }
32664
32665 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32666
32667 return DAG.getMemIntrinsicNode(
32668 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32669 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32670 /*MemVT=*/N->getSimpleValueType(0), MMO);
32671}
32672
32673/// Lower atomic_load_ops into LOCK-prefixed operations.
32674static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32675 const X86Subtarget &Subtarget) {
32676 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32677 SDValue Chain = N->getOperand(0);
32678 SDValue LHS = N->getOperand(1);
32679 SDValue RHS = N->getOperand(2);
32680 unsigned Opc = N->getOpcode();
32681 MVT VT = N->getSimpleValueType(0);
32682 SDLoc DL(N);
32683
32684 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32685 // can only be lowered when the result is unused. They should have already
32686 // been transformed into a cmpxchg loop in AtomicExpand.
32687 if (N->hasAnyUseOfValue(0)) {
32688 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32689 // select LXADD if LOCK_SUB can't be selected.
32690 if (Opc == ISD::ATOMIC_LOAD_SUB) {
32691 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
32692 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32693 RHS, AN->getMemOperand());
32694 }
32695     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32696            "Used AtomicRMW ops other than Add should have been expanded!");
32697 return N;
32698 }
32699
32700 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32701 // The core idea here is that since the memory location isn't actually
32702 // changing, all we need is a lowering for the *ordering* impacts of the
32703 // atomicrmw. As such, we can choose a different operation and memory
32704 // location to minimize impact on other code.
32705 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
32706 // On X86, the only ordering which actually requires an instruction is
32707 // seq_cst which isn't SingleThread, everything just needs to be preserved
32708 // during codegen and then dropped. Note that we expect (but don't assume),
32709 // that orderings other than seq_cst and acq_rel have been canonicalized to
32710 // a store or load.
32711 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32712 AN->getSyncScopeID() == SyncScope::System) {
32713 // Prefer a locked operation against a stack location to minimize cache
32714 // traffic. This assumes that stack locations are very likely to be
32715 // accessed only by the owning thread.
32716 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32717       assert(!N->hasAnyUseOfValue(0));
32718 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32719 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32720 DAG.getUNDEF(VT), NewChain);
32721 }
32722 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32723 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32724     assert(!N->hasAnyUseOfValue(0));
32725 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32726 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32727 DAG.getUNDEF(VT), NewChain);
32728 }
32729
32730 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32731 // RAUW the chain, but don't worry about the result, as it's unused.
32732   assert(!N->hasAnyUseOfValue(0));
32733 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32734 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32735 DAG.getUNDEF(VT), LockOp.getValue(1));
32736}
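// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp; the
// helper name is hypothetical and GCC/Clang atomic builtins are used for
// brevity.] A source-level analogue of the "idempotent atomicrmw" case handled
// above: the stored value never changes and the result is unused, so only the
// seq_cst ordering must survive, and the backend is free to realize it as a
// LOCK-prefixed RMW on a stack slot (emitLockedStackOp) rather than touching
// the original location.
static void orderingOnly(int *P) {
  (void)__atomic_fetch_or(P, 0, __ATOMIC_SEQ_CST); // value unchanged, result unused
}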
32737
32738static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32739 const X86Subtarget &Subtarget) {
32740 auto *Node = cast<AtomicSDNode>(Op.getNode());
32741 SDLoc dl(Node);
32742 EVT VT = Node->getMemoryVT();
32743
32744 bool IsSeqCst =
32745 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32746 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32747
32748 // If this store is not sequentially consistent and the type is legal
32749 // we can just keep it.
32750 if (!IsSeqCst && IsTypeLegal)
32751 return Op;
32752
32753 if (VT == MVT::i64 && !IsTypeLegal) {
32754 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32755 // is enabled.
32756 bool NoImplicitFloatOps =
32757 DAG.getMachineFunction().getFunction().hasFnAttribute(
32758 Attribute::NoImplicitFloat);
32759 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32760 SDValue Chain;
32761 if (Subtarget.hasSSE1()) {
32762 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
32763 Node->getOperand(2));
32764 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32765 SclToVec = DAG.getBitcast(StVT, SclToVec);
32766 SDVTList Tys = DAG.getVTList(MVT::Other);
32767 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32768 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
32769 MVT::i64, Node->getMemOperand());
32770 } else if (Subtarget.hasX87()) {
32771 // First load this into an 80-bit X87 register using a stack temporary.
32772 // This will put the whole integer into the significand.
32773 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32774 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32775 MachinePointerInfo MPI =
32776 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32777 Chain =
32778 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
32779 MPI, MaybeAlign(), MachineMemOperand::MOStore);
32780 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32781 SDValue LdOps[] = {Chain, StackPtr};
32782 SDValue Value = DAG.getMemIntrinsicNode(
32783 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
32784 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
32785 Chain = Value.getValue(1);
32786
32787 // Now use an FIST to do the atomic store.
32788 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32789 Chain =
32790 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
32791 StoreOps, MVT::i64, Node->getMemOperand());
32792 }
32793
32794 if (Chain) {
32795 // If this is a sequentially consistent store, also emit an appropriate
32796 // barrier.
32797 if (IsSeqCst)
32798 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
32799
32800 return Chain;
32801 }
32802 }
32803 }
32804
32805 // Convert seq_cst store -> xchg
32806 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32807 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
32808 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
32809 Node->getMemoryVT(),
32810 Node->getOperand(0),
32811 Node->getOperand(1), Node->getOperand(2),
32812 Node->getMemOperand());
32813 return Swap.getValue(1);
32814}
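// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp; the
// helper name is hypothetical and a GCC/Clang atomic builtin is used for
// brevity.] Source-level analogue of the "seq_cst store -> xchg" fallback at
// the end of LowerATOMIC_STORE: a sequentially consistent store that survives
// to this point is implemented as an exchange whose result is discarded, since
// XCHG with memory is implicitly locked and provides the required ordering.
static void seqCstStore(long *P, long V) {
  __atomic_store_n(P, V, __ATOMIC_SEQ_CST); // typically emitted as xchg on x86-64
}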
32815
32816static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
32817 SDNode *N = Op.getNode();
32818 MVT VT = N->getSimpleValueType(0);
32819 unsigned Opc = Op.getOpcode();
32820
32821 // Let legalize expand this if it isn't a legal type yet.
32822 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32823 return SDValue();
32824
32825 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
32826 SDLoc DL(N);
32827
32828 // Set the carry flag.
32829 SDValue Carry = Op.getOperand(2);
32830 EVT CarryVT = Carry.getValueType();
32831 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
32832 Carry, DAG.getAllOnesConstant(DL, CarryVT));
32833
32834 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
32835 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
32836 Op.getOperand(0), Op.getOperand(1),
32837 Carry.getValue(1));
32838
32839 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
32840 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
32841 Sum.getValue(1), DL, DAG);
32842 if (N->getValueType(1) == MVT::i1)
32843 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
32844
32845 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32846}
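// [Editor's note - illustrative sketch, not part of X86ISelLowering.cpp; the
// helper name is hypothetical.] Scalar model of the ADDCARRY lowering above.
// The DAG code first adds all-ones (-1) to the incoming 0/1 carry so that CF
// ends up set exactly when the carry was 1, then feeds CF into ADC; the helper
// below computes the same sum and carry-out.
static unsigned long long addWithCarry64(unsigned long long A,
                                         unsigned long long B,
                                         unsigned CarryIn,
                                         unsigned *CarryOut) {
  unsigned long long Sum = A + B + CarryIn;
  // Carry out iff A + B + CarryIn overflowed 64 bits (the CF an ADC produces).
  *CarryOut = (Sum < A || (CarryIn && Sum == A)) ? 1 : 0;
  return Sum;
}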
32847
32848static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
32849 SelectionDAG &DAG) {
32850   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
32851
32852 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
32853 // which returns the values as { float, float } (in XMM0) or
32854 // { double, double } (which is returned in XMM0, XMM1).
32855 SDLoc dl(Op);
32856 SDValue Arg = Op.getOperand(0);
32857 EVT ArgVT = Arg.getValueType();
32858 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
32859
32860 TargetLowering::ArgListTy Args;
32861 TargetLowering::ArgListEntry Entry;
32862
32863 Entry.Node = Arg;
32864 Entry.Ty = ArgTy;
32865 Entry.IsSExt = false;
32866 Entry.IsZExt = false;
32867 Args.push_back(Entry);
32868
32869 bool isF64 = ArgVT == MVT::f64;
32870 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
32871 // the small struct {f32, f32} is returned in (eax, edx). For f64,
32872 // the results are returned via SRet in memory.
32873 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32874 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
32875 const char *LibcallName = TLI.getLibcallName(LC);
32876 SDValue Callee =
32877 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
32878
32879 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
32880 : (Type *)FixedVectorType::get(ArgTy, 4);
32881
32882 TargetLowering::CallLoweringInfo CLI(DAG);
32883 CLI.setDebugLoc(dl)
32884 .setChain(DAG.getEntryNode())
32885 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
32886
32887 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
32888
32889 if (isF64)
32890 // Returned in xmm0 and xmm1.
32891 return CallResult.first;
32892
32893 // Returned in bits 0:31 and 32:63 of xmm0.
32894 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
32895 CallResult.first, DAG.getIntPtrConstant(0, dl));
32896 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
32897 CallResult.first, DAG.getIntPtrConstant(1, dl));
32898 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
32899 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
32900}
32901
32902/// Widen a vector input to a vector of NVT. The
32903/// input vector must have the same element type as NVT.
32904static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
32905 bool FillWithZeroes = false) {
32906 // Check if InOp already has the right width.
32907 MVT InVT = InOp.getSimpleValueType();
32908 if (InVT == NVT)
32909 return InOp;
32910
32911 if (InOp.isUndef())
32912 return DAG.getUNDEF(NVT);
32913
32914 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
32915        "input and widen element type must match");
32916
32917 unsigned InNumElts = InVT.getVectorNumElements();
32918 unsigned WidenNumElts = NVT.getVectorNumElements();
32919 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
32920        "Unexpected request for vector widening");
32921
32922 SDLoc dl(InOp);
32923 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
32924 InOp.getNumOperands() == 2) {
32925 SDValue N1 = InOp.getOperand(1);
32926 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
32927 N1.isUndef()) {
32928 InOp = InOp.getOperand(0);
32929 InVT = InOp.getSimpleValueType();
32930 InNumElts = InVT.getVectorNumElements();
32931 }
32932 }
32933 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
32934 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
32935 SmallVector<SDValue, 16> Ops;
32936 for (unsigned i = 0; i < InNumElts; ++i)
32937 Ops.push_back(InOp.getOperand(i));
32938
32939 EVT EltVT = InOp.getOperand(0).getValueType();
32940
32941 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
32942 DAG.getUNDEF(EltVT);
32943 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
32944 Ops.push_back(FillVal);
32945 return DAG.getBuildVector(NVT, dl, Ops);
32946 }
32947 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
32948 DAG.getUNDEF(NVT);
32949 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
32950 InOp, DAG.getIntPtrConstant(0, dl));
32951}
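// [Editor's note - worked example for ExtendToType above, assuming a v4i32
// input widened to v16i32.] A build_vector of constants (or constant FPs) is
// re-emitted with 12 trailing zero (when FillWithZeroes) or undef elements;
// any other value becomes (insert_subvector (zero-or-undef v16i32), In, 0),
// and a concat whose upper half is zero/undef is first peeled back to its
// low half so only the meaningful part is widened.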
32952
32953static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
32954 SelectionDAG &DAG) {
32955 assert(Subtarget.hasAVX512() &&
32956        "MGATHER/MSCATTER are supported on AVX-512 arch only");
32957
32958 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
32959 SDValue Src = N->getValue();
32960 MVT VT = Src.getSimpleValueType();
32961 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
32962 SDLoc dl(Op);
32963
32964 SDValue Scale = N->getScale();
32965 SDValue Index = N->getIndex();
32966 SDValue Mask = N->getMask();
32967 SDValue Chain = N->getChain();
32968 SDValue BasePtr = N->getBasePtr();
32969
32970 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
32971     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32972 // If the index is v2i64 and we have VLX we can use xmm for data and index.
32973 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
32974 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32975 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
32976 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
32977 SDVTList VTs = DAG.getVTList(MVT::Other);
32978 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32979 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32980 N->getMemoryVT(), N->getMemOperand());
32981 }
32982 return SDValue();
32983 }
32984
32985 MVT IndexVT = Index.getSimpleValueType();
32986
32987 // If the index is v2i32, we're being called by type legalization and we
32988 // should just let the default handling take care of it.
32989 if (IndexVT == MVT::v2i32)
32990 return SDValue();
32991
32992 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
32993 // need to widen until one is.
32994 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
32995 !Index.getSimpleValueType().is512BitVector()) {
32996 // Determine how much we need to widen by to get a 512-bit type.
32997 unsigned Factor = std::min(512/VT.getSizeInBits(),
32998 512/IndexVT.getSizeInBits());
32999 unsigned NumElts = VT.getVectorNumElements() * Factor;
33000
33001 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33002 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33003 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33004
33005 Src = ExtendToType(Src, VT, DAG);
33006 Index = ExtendToType(Index, IndexVT, DAG);
33007 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33008 }
33009
33010 SDVTList VTs = DAG.getVTList(MVT::Other);
33011 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33012 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33013 N->getMemoryVT(), N->getMemOperand());
33014}
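// [Editor's note - worked example for the widening in LowerMSCATTER above,
// assuming no VLX, v8f32 data and v8i32 indices.] Factor = min(512/256,
// 512/256) = 2, so NumElts = 16 and the node is rebuilt with v16f32 data,
// v16i32 indices and a v16i1 mask (extra mask lanes zeroed by ExtendToType),
// reaching the 512-bit width that the ZMM-only AVX-512 scatter forms require.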
33015
33016static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33017 SelectionDAG &DAG) {
33018
33019 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33020 MVT VT = Op.getSimpleValueType();
33021 MVT ScalarVT = VT.getScalarType();
33022 SDValue Mask = N->getMask();
33023 MVT MaskVT = Mask.getSimpleValueType();
33024 SDValue PassThru = N->getPassThru();
33025 SDLoc dl(Op);
33026
33027 // Handle AVX masked loads which don't support passthru other than 0.
33028 if (MaskVT.getVectorElementType() != MVT::i1) {
33029 // We also allow undef in the isel pattern.
33030 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33031 return Op;
33032
33033 SDValue NewLoad = DAG.getMaskedLoad(
33034 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33035 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33036 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33037 N->isExpandingLoad());
33038 // Emit a blend.
33039 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33040 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33041 }
33042
33043 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33044        "Expanding masked load is supported on AVX-512 target only!");
33045
33046 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33047        "Expanding masked load is supported for 32 and 64-bit types only!");
33048
33049 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33050        "Cannot lower masked load op.");
33051
33052 assert((ScalarVT.getSizeInBits() >= 32 ||
33053         (Subtarget.hasBWI() &&
33054          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33055        "Unsupported masked load op.");
33056
33057 // This operation is legal for targets with VLX, but without
33058 // VLX the vector should be widened to 512 bits.
33059 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33060 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33061 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33062
33063 // Mask element has to be i1.
33064 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33065        "Unexpected mask type");
33066
33067 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33068
33069 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33070 SDValue NewLoad = DAG.getMaskedLoad(
33071 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33072 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33073 N->getExtensionType(), N->isExpandingLoad());
33074
33075 SDValue Extract =
33076 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33077 DAG.getIntPtrConstant(0, dl));
33078 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33079 return DAG.getMergeValues(RetOps, dl);
33080}
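// [Editor's note - worked example for the widening in LowerMLOAD above,
// assuming AVX-512 without VLX and a v8i32 masked load.] NumEltsInWideVec =
// 512 / 32 = 16, so the load is rebuilt as a v16i32 masked load with a v16i1
// mask whose extra lanes are forced to zero by ExtendToType, and the original
// v8i32 result is recovered with an extract_subvector at index 0.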
33081
33082static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33083 SelectionDAG &DAG) {
33084 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33085 SDValue DataToStore = N->getValue();
33086 MVT VT = DataToStore.getSimpleValueType();
33087 MVT ScalarVT = VT.getScalarType();
33088 SDValue Mask = N->getMask();
33089 SDLoc dl(Op);
33090
33091 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33092        "Expanding masked load is supported on AVX-512 target only!");
33093
33094 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33095        "Expanding masked load is supported for 32 and 64-bit types only!");
33096
33097 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33098        "Cannot lower masked store op.");
33099
33100 assert((ScalarVT.getSizeInBits() >= 32 ||
33101         (Subtarget.hasBWI() &&
33102          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33103        "Unsupported masked store op.");
33104
33105 // This operation is legal for targets with VLX, but without
33106 // VLX the vector should be widened to 512 bits.
33107 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33108 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33109
33110 // Mask element has to be i1.
33111 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33112        "Unexpected mask type");
33113
33114 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33115
33116 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33117 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33118 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33119 N->getOffset(), Mask, N->getMemoryVT(),
33120 N->getMemOperand(), N->getAddressingMode(),
33121 N->isTruncatingStore(), N->isCompressingStore());
33122}
33123
33124static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33125 SelectionDAG &DAG) {
33126 assert(Subtarget.hasAVX2() &&
33127        "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33128
33129 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33130 SDLoc dl(Op);
33131 MVT VT = Op.getSimpleValueType();
33132 SDValue Index = N->getIndex();
33133 SDValue Mask = N->getMask();
33134 SDValue PassThru = N->getPassThru();
33135 MVT IndexVT = Index.getSimpleValueType();
33136
33137 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33138
33139 // If the index is v2i32, we're being called by type legalization.
33140 if (IndexVT == MVT::v2i32)
33141 return SDValue();
33142
33143 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33144 // need to widen until one is.
33145 MVT OrigVT = VT;
33146 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33147 !IndexVT.is512BitVector()) {
33148 // Determine how much we need to widen by to get a 512-bit type.
33149 unsigned Factor = std::min(512/VT.getSizeInBits(),
33150 512/IndexVT.getSizeInBits());
33151
33152 unsigned NumElts = VT.getVectorNumElements() * Factor;
33153
33154 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33155 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33156 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33157
33158 PassThru = ExtendToType(PassThru, VT, DAG);
33159 Index = ExtendToType(Index, IndexVT, DAG);
33160 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33161 }
33162
33163 // Break dependency on the data register.
33164 if (PassThru.isUndef())
33165 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33166
33167 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33168 N->getScale() };
33169 SDValue NewGather = DAG.getMemIntrinsicNode(
33170 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33171 N->getMemOperand());
33172 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33173 NewGather, DAG.getIntPtrConstant(0, dl));
33174 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33175}
33176
33177static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33178 SDLoc dl(Op);
33179 SDValue Src = Op.getOperand(0);
33180 MVT DstVT = Op.getSimpleValueType();
33181
33182 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33183 unsigned SrcAS = N->getSrcAddressSpace();
33184
33185 assert(SrcAS != N->getDestAddressSpace() &&
33186        "addrspacecast must be between different address spaces");
33187
33188 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33189 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33190 } else if (DstVT == MVT::i64) {
33191 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33192 } else if (DstVT == MVT::i32) {
33193 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33194 } else {
33195 report_fatal_error("Bad address space in addrspacecast");
33196 }
33197 return Op;
33198}
33199
33200SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33201 SelectionDAG &DAG) const {
33202 // TODO: Eventually, the lowering of these nodes should be informed by or
33203 // deferred to the GC strategy for the function in which they appear. For
33204 // now, however, they must be lowered to something. Since they are logically
33205 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33206 // require special handling for these nodes), lower them as literal NOOPs for
33207 // the time being.
33208 SmallVector<SDValue, 2> Ops;
33209 Ops.push_back(Op.getOperand(0));
33210 if (Op->getGluedNode())
33211 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33212
33213 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33214 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33215}
33216
33217// Custom split CVTPS2PH with wide types.
33218static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33219 SDLoc dl(Op);
33220 EVT VT = Op.getValueType();
33221 SDValue Lo, Hi;
33222 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33223 EVT LoVT, HiVT;
33224 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33225 SDValue RC = Op.getOperand(1);
33226 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33227 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33228 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33229}
33230
33231static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33232 unsigned OpNo) {
33233 const APInt Operand(32, OpNo);
33234 std::string OpNoStr = llvm::toString(Operand, 10, false);
33235 std::string Str(" $");
33236
33237 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33238 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33239
33240 auto I = StringRef::npos;
33241 for (auto &AsmStr : AsmStrs) {
33242 // Match the OpNo string exactly so that we don't accidentally match a
33243 // sub-string, e.g. "$12" contains "$1".
33244 if (AsmStr.endswith(OpNoStr1))
33245 I = AsmStr.size() - OpNoStr1.size();
33246
33247 // Get the index of operand in AsmStr.
33248 if (I == StringRef::npos)
33249 I = AsmStr.find(OpNoStr1 + ",");
33250 if (I == StringRef::npos)
33251 I = AsmStr.find(OpNoStr2);
33252
33253 if (I == StringRef::npos)
33254 continue;
33255
33256     assert(I > 0 && "Unexpected inline asm string!");
33257 // Remove the operand string and label (if they exist).
33258 // For example:
33259 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33260 // ==>
33261 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33262 // ==>
33263 // "call dword ptr "
33264 auto TmpStr = AsmStr.substr(0, I);
33265 I = TmpStr.rfind(':');
33266 if (I == StringRef::npos)
33267 return TmpStr;
33268
33269     assert(I < TmpStr.size() && "Unexpected inline asm string!");
33270 auto Asm = TmpStr.drop_front(I + 1);
33271 return Asm;
33272 }
33273
33274 return StringRef();
33275}
33276
33277bool X86TargetLowering::isInlineAsmTargetBranch(
33278 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33279 StringRef InstrStr = getInstrStrFromOpNo(AsmStrs, OpNo);
33280
33281 if (InstrStr.contains("call"))
33282 return true;
33283
33284 return false;
33285}
33286
33287/// Provide custom lowering hooks for some operations.
33288SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33289 switch (Op.getOpcode()) {
33290   default: llvm_unreachable("Should not custom lower this!");
33291 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33292 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33293 return LowerCMP_SWAP(Op, Subtarget, DAG);
33294 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33295 case ISD::ATOMIC_LOAD_ADD:
33296 case ISD::ATOMIC_LOAD_SUB:
33297 case ISD::ATOMIC_LOAD_OR:
33298 case ISD::ATOMIC_LOAD_XOR:
33299 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33300 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33301 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33302 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33303 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33304 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33305 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33306 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33307 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33308 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33309 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33310 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33311 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33312 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33313 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33314 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33315 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33316 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33317 case ISD::SHL_PARTS:
33318 case ISD::SRA_PARTS:
33319 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33320 case ISD::FSHL:
33321 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33322 case ISD::STRICT_SINT_TO_FP:
33323 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33324 case ISD::STRICT_UINT_TO_FP:
33325 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33326 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33327 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33328 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33329 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33330 case ISD::ZERO_EXTEND_VECTOR_INREG:
33331 case ISD::SIGN_EXTEND_VECTOR_INREG:
33332 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33333 case ISD::FP_TO_SINT:
33334 case ISD::STRICT_FP_TO_SINT:
33335 case ISD::FP_TO_UINT:
33336 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33337 case ISD::FP_TO_SINT_SAT:
33338 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33339 case ISD::FP_EXTEND:
33340 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33341 case ISD::FP_ROUND:
33342 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33343 case ISD::FP16_TO_FP:
33344 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33345 case ISD::FP_TO_FP16:
33346 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33347 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33348 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33349 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33350 case ISD::FADD:
33351 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33352 case ISD::FROUND: return LowerFROUND(Op, DAG);
33353 case ISD::FABS:
33354 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33355 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33356 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33357 case ISD::LRINT:
33358 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33359 case ISD::SETCC:
33360 case ISD::STRICT_FSETCC:
33361 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33362 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33363 case ISD::SELECT: return LowerSELECT(Op, DAG);
33364 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33365 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33366 case ISD::VASTART: return LowerVASTART(Op, DAG);
33367 case ISD::VAARG: return LowerVAARG(Op, DAG);
33368 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33369 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33370 case ISD::INTRINSIC_VOID:
33371 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33372 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33373 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33374 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33375 case ISD::FRAME_TO_ARGS_OFFSET:
33376 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33377 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33378 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33379 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33380 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33381 case ISD::EH_SJLJ_SETUP_DISPATCH:
33382 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33383 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33384 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33385 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33386 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33387 case ISD::CTLZ:
33388 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33389 case ISD::CTTZ:
33390 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33391 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33392 case ISD::MULHS:
33393 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33394 case ISD::ROTL:
33395 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33396 case ISD::SRA:
33397 case ISD::SRL:
33398 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33399 case ISD::SADDO:
33400 case ISD::UADDO:
33401 case ISD::SSUBO:
33402 case ISD::USUBO: return LowerXALUO(Op, DAG);
33403 case ISD::SMULO:
33404 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33405 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33406 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33407 case ISD::SADDO_CARRY:
33408 case ISD::SSUBO_CARRY:
33409 case ISD::ADDCARRY:
33410 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
33411 case ISD::ADD:
33412 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33413 case ISD::UADDSAT:
33414 case ISD::SADDSAT:
33415 case ISD::USUBSAT:
33416 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33417 case ISD::SMAX:
33418 case ISD::SMIN:
33419 case ISD::UMAX:
33420 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33421 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33422 case ISD::ABDS:
33423 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33424 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33425 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33426 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33427 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33428 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33429 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33430 case ISD::GC_TRANSITION_START:
33431 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33432 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33433 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33434 }
33435}
33436
33437/// Replace a node with an illegal result type with a new node built out of
33438/// custom code.
33439void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33440 SmallVectorImpl<SDValue>&Results,
33441 SelectionDAG &DAG) const {
33442 SDLoc dl(N);
33443 switch (N->getOpcode()) {
33444 default:
33445#ifndef NDEBUG
33446 dbgs() << "ReplaceNodeResults: ";
33447 N->dump(&DAG);
33448#endif
33449     llvm_unreachable("Do not know how to custom type legalize this operation!");
33450 case X86ISD::CVTPH2PS: {
33451 EVT VT = N->getValueType(0);
33452 SDValue Lo, Hi;
33453 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33454 EVT LoVT, HiVT;
33455 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33456 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33457 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33458 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33459 Results.push_back(Res);
33460 return;
33461 }
33462 case X86ISD::STRICT_CVTPH2PS: {
33463 EVT VT = N->getValueType(0);
33464 SDValue Lo, Hi;
33465 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33466 EVT LoVT, HiVT;
33467 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33468 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33469 {N->getOperand(0), Lo});
33470 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33471 {N->getOperand(0), Hi});
33472 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33473 Lo.getValue(1), Hi.getValue(1));
33474 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33475 Results.push_back(Res);
33476 Results.push_back(Chain);
33477 return;
33478 }
33479 case X86ISD::CVTPS2PH:
33480 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33481 return;
33482 case ISD::CTPOP: {
33483     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33484 // Use a v2i64 if possible.
33485 bool NoImplicitFloatOps =
33486 DAG.getMachineFunction().getFunction().hasFnAttribute(
33487 Attribute::NoImplicitFloat);
33488 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33489 SDValue Wide =
33490 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33491 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33492 // Bit count should fit in 32-bits, extract it as that and then zero
33493 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33494 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33495 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33496 DAG.getIntPtrConstant(0, dl));
33497 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33498 Results.push_back(Wide);
33499 }
33500 return;
33501 }
33502 case ISD::MUL: {
33503 EVT VT = N->getValueType(0);
33504     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33505            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33506 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33507 // elements are needed.
33508 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33509 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33510 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33511 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33512 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33513 unsigned NumConcats = 16 / VT.getVectorNumElements();
33514 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33515 ConcatOps[0] = Res;
33516 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33517 Results.push_back(Res);
33518 return;
33519 }
33520 case ISD::SMULO:
33521 case ISD::UMULO: {
33522 EVT VT = N->getValueType(0);
33523 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33524 VT == MVT::v2i32 && "Unexpected VT!");
33525 bool IsSigned = N->getOpcode() == ISD::SMULO;
33526 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33527 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33528 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33529 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33530 // Extract the high 32 bits from each result using PSHUFD.
33531 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33532 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33533 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
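// Viewed as v4i32 (little endian), lanes 1 and 3 hold bits 63:32 of the two
// 64-bit products, so the {1, 3, -1, -1} mask gathers both high halves into
// the low two lanes before the subvector extract below.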
33534 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33535 DAG.getIntPtrConstant(0, dl));
33536
33537 // Truncate the low bits of the result. This will become PSHUFD.
33538 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33539
33540 SDValue HiCmp;
33541 if (IsSigned) {
33542 // SMULO overflows if the high bits don't match the sign of the low.
33543 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33544 } else {
33545 // UMULO overflows if the high bits are non-zero.
33546 HiCmp = DAG.getConstant(0, dl, VT);
33547 }
33548 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33549
33550 // Widen the result by padding with undef.
33551 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33552 DAG.getUNDEF(VT));
33553 Results.push_back(Res);
33554 Results.push_back(Ovf);
33555 return;
33556 }
33557 case X86ISD::VPMADDWD: {
33558 // Legalize types for X86ISD::VPMADDWD by widening.
33559 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33560
33561 EVT VT = N->getValueType(0);
33562 EVT InVT = N->getOperand(0).getValueType();
33563 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33564 "Expected a VT that divides into 128 bits.");
33565 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33566 "Unexpected type action!");
33567 unsigned NumConcat = 128 / InVT.getSizeInBits();
33568
33569 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33570 InVT.getVectorElementType(),
33571 NumConcat * InVT.getVectorNumElements());
33572 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33573 VT.getVectorElementType(),
33574 NumConcat * VT.getVectorNumElements());
33575
33576 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33577 Ops[0] = N->getOperand(0);
33578 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33579 Ops[0] = N->getOperand(1);
33580 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33581
33582 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
33583 Results.push_back(Res);
33584 return;
33585 }
33586 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33587 case X86ISD::FMINC:
33588 case X86ISD::FMIN:
33589 case X86ISD::FMAXC:
33590 case X86ISD::FMAX: {
33591 EVT VT = N->getValueType(0);
33592 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33593 SDValue UNDEF = DAG.getUNDEF(VT);
33594 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33595 N->getOperand(0), UNDEF);
33596 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33597 N->getOperand(1), UNDEF);
33598 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
33599 return;
33600 }
33601 case ISD::SDIV:
33602 case ISD::UDIV:
33603 case ISD::SREM:
33604 case ISD::UREM: {
33605 EVT VT = N->getValueType(0);
33606 if (VT.isVector()) {
33607 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33608 "Unexpected type action!");
33609 // If this RHS is a constant splat vector we can widen this and let
33610 // division/remainder by constant optimize it.
33611 // TODO: Can we do something for non-splat?
33612 APInt SplatVal;
33613 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33614 unsigned NumConcats = 128 / VT.getSizeInBits();
33615 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33616 Ops0[0] = N->getOperand(0);
33617 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33618 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33619 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33620 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
33621 Results.push_back(Res);
33622 }
33623 return;
33624 }
33625
33626 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33627 Results.push_back(V);
33628 return;
33629 }
33630 case ISD::TRUNCATE: {
33631 MVT VT = N->getSimpleValueType(0);
33632 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33633 return;
33634
33635 // The generic legalizer will try to widen the input type to the same
33636 // number of elements as the widened result type. But this isn't always
33637 // the best thing so do some custom legalization to avoid some cases.
33638 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33639 SDValue In = N->getOperand(0);
33640 EVT InVT = In.getValueType();
33641
33642 unsigned InBits = InVT.getSizeInBits();
33643 if (128 % InBits == 0) {
33644 // 128 bit and smaller inputs should avoid truncate altogether and
33645 // just use a build_vector that will become a shuffle.
33646 // TODO: Widen and use a shuffle directly?
33647 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
33648 EVT EltVT = VT.getVectorElementType();
33649 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33650 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
33651 // Use the original element count so we don't do more scalar opts than
33652 // necessary.
33653 unsigned MinElts = VT.getVectorNumElements();
33654 for (unsigned i=0; i < MinElts; ++i) {
33655 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
33656 DAG.getIntPtrConstant(i, dl));
33657 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
33658 }
33659 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
33660 return;
33661 }
33662 // With AVX512 there are some cases that can use a target specific
33663 // truncate node to go from 256/512 to less than 128 with zeros in the
33664 // upper elements of the 128 bit result.
33665 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
33666 // We can use VTRUNC directly for 256 bits with VLX or for any 512 bits.
33667 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
33668 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33669 return;
33670 }
33671 // There's one case we can widen to 512 bits and use VTRUNC.
33672 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
33673 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
33674 DAG.getUNDEF(MVT::v4i64));
33675 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33676 return;
33677 }
33678 }
33679 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
33680 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
33681 isTypeLegal(MVT::v4i64)) {
33682 // Input needs to be split and output needs to be widened. Let's use two
33683 // VTRUNCs, and shuffle their results together into the wider type.
33684 SDValue Lo, Hi;
33685 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
33686
33687 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
33688 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
33689 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
33690 { 0, 1, 2, 3, 16, 17, 18, 19,
33691 -1, -1, -1, -1, -1, -1, -1, -1 });
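// Each VTRUNC leaves its four truncated bytes in lanes 0-3 of a v16i8, so
// this shuffle packs the Lo and Hi bytes into the low eight lanes of the
// widened result (indices 16-19 address the second shuffle operand).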
33692 Results.push_back(Res);
33693 return;
33694 }
33695
33696 return;
33697 }
33698 case ISD::ANY_EXTEND:
33699 // Right now, only MVT::v8i8 has Custom action for an illegal type.
33700 // It's intended to custom handle the input type.
33701 assert(N->getValueType(0) == MVT::v8i8 &&
33702 "Do not know how to legalize this Node");
33703 return;
33704 case ISD::SIGN_EXTEND:
33705 case ISD::ZERO_EXTEND: {
33706 EVT VT = N->getValueType(0);
33707 SDValue In = N->getOperand(0);
33708 EVT InVT = In.getValueType();
33709 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
33710 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
33711 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33712 "Unexpected type action!");
33713 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
33714 // Custom split this so we can extend i8/i16->i32 invec. This is better
33715 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
33716 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
33717 // we allow the sra from the extend to i32 to be shared by the split.
33718 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
33719
33720 // Fill a vector with sign bits for each element.
33721 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
33722 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
33723
33724 // Create an unpackl and unpackh to interleave the sign bits then bitcast
33725 // to v2i64.
33726 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33727 {0, 4, 1, 5});
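// Interleaving In with its sign mask gives {In[0], Sign[0], In[1], Sign[1]};
// on little-endian x86 the v2i64 bitcast of that pair is exactly the
// sign-extended 64-bit value for each element.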
33728 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
33729 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33730 {2, 6, 3, 7});
33731 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
33732
33733 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33734 Results.push_back(Res);
33735 return;
33736 }
33737
33738 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
33739 if (!InVT.is128BitVector()) {
33740 // Not a 128 bit vector, but maybe type legalization will promote
33741 // it to 128 bits.
33742 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
33743 return;
33744 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
33745 if (!InVT.is128BitVector())
33746 return;
33747
33748 // Promote the input to 128 bits. Type legalization will turn this into
33749 // zext_inreg/sext_inreg.
33750 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
33751 }
33752
33753 // Perform custom splitting instead of the two stage extend we would get
33754 // by default.
33755 EVT LoVT, HiVT;
33756 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33757 assert(isTypeLegal(LoVT) && "Split VT not legal?");
33758
33759 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
33760
33761 // We need to shift the input over by half the number of elements.
33762 unsigned NumElts = InVT.getVectorNumElements();
33763 unsigned HalfNumElts = NumElts / 2;
33764 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
33765 for (unsigned i = 0; i != HalfNumElts; ++i)
33766 ShufMask[i] = i + HalfNumElts;
33767
33768 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
33769 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
33770
33771 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33772 Results.push_back(Res);
33773 }
33774 return;
33775 }
33776 case ISD::FP_TO_SINT:
33777 case ISD::STRICT_FP_TO_SINT:
33778 case ISD::FP_TO_UINT:
33779 case ISD::STRICT_FP_TO_UINT: {
33780 bool IsStrict = N->isStrictFPOpcode();
33781 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
33782 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
33783 EVT VT = N->getValueType(0);
33784 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33785 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33786 EVT SrcVT = Src.getValueType();
33787
33788 SDValue Res;
33789 if (isSoftFP16(SrcVT)) {
33790 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
33791 if (IsStrict) {
33792 Res =
33793 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
33794 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
33795 {NVT, MVT::Other}, {Chain, Src})});
33796 Chain = Res.getValue(1);
33797 } else {
33798 Res = DAG.getNode(N->getOpcode(), dl, VT,
33799 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
33800 }
33801 Results.push_back(Res);
33802 if (IsStrict)
33803 Results.push_back(Chain);
33804
33805 return;
33806 }
33807
33808 if (VT.isVector() && Subtarget.hasFP16() &&
33809 SrcVT.getVectorElementType() == MVT::f16) {
33810 EVT EleVT = VT.getVectorElementType();
33811 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
33812
33813 if (SrcVT != MVT::v8f16) {
33814 SDValue Tmp =
33815 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
33816 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
33817 Ops[0] = Src;
33818 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
33819 }
33820
33821 if (IsStrict) {
33822 unsigned Opc =
33823 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33824 Res =
33825 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33826 Chain = Res.getValue(1);
33827 } else {
33828 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33829 Res = DAG.getNode(Opc, dl, ResVT, Src);
33830 }
33831
33832 // TODO: Need to add exception check code for strict FP.
33833 if (EleVT.getSizeInBits() < 16) {
33834 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
33835 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
33836
33837 // Now widen to 128 bits.
33838 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
33839 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
33840 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
33841 ConcatOps[0] = Res;
33842 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33843 }
33844
33845 Results.push_back(Res);
33846 if (IsStrict)
33847 Results.push_back(Chain);
33848
33849 return;
33850 }
33851
33852 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
33853 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33854 "Unexpected type action!");
33855
33856 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
33857 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
33858 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
33859 VT.getVectorNumElements());
33860 SDValue Res;
33861 SDValue Chain;
33862 if (IsStrict) {
33863 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
33864 {N->getOperand(0), Src});
33865 Chain = Res.getValue(1);
33866 } else
33867 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
33868
33869 // Preserve what we know about the size of the original result. If the
33870 // result is v2i32, we have to manually widen the assert.
33871 if (PromoteVT == MVT::v2i32)
33872 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33873 DAG.getUNDEF(MVT::v2i32));
33874
33875 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
33876 Res.getValueType(), Res,
33877 DAG.getValueType(VT.getVectorElementType()));
33878
33879 if (PromoteVT == MVT::v2i32)
33880 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
33881 DAG.getIntPtrConstant(0, dl));
33882
33883 // Truncate back to the original width.
33884 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33885
33886 // Now widen to 128 bits.
33887 unsigned NumConcats = 128 / VT.getSizeInBits();
33888 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
33889 VT.getVectorNumElements() * NumConcats);
33890 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33891 ConcatOps[0] = Res;
33892 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33893 Results.push_back(Res);
33894 if (IsStrict)
33895 Results.push_back(Chain);
33896 return;
33897 }
33898
33899
33900 if (VT == MVT::v2i32) {
33901 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33902 "Strict unsigned conversion requires AVX512");
33903 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33904 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33905 "Unexpected type action!");
33906 if (Src.getValueType() == MVT::v2f64) {
33907 if (!IsSigned && !Subtarget.hasAVX512()) {
33908 SDValue Res =
33909 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33910 Results.push_back(Res);
33911 return;
33912 }
33913
33914 unsigned Opc;
33915 if (IsStrict)
33916 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33917 else
33918 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33919
33920 // If we have VLX we can emit a target specific FP_TO_UINT node.
33921 if (!IsSigned && !Subtarget.hasVLX()) {
33922 // Otherwise we can defer to the generic legalizer which will widen
33923 // the input as well. This will be further widened during op
33924 // legalization to v8i32<-v8f64.
33925 // For strict nodes we'll need to widen ourselves.
33926 // FIXME: Fix the type legalizer to safely widen strict nodes?
33927 if (!IsStrict)
33928 return;
33929 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33930 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33931 Opc = N->getOpcode();
33932 }
33933 SDValue Res;
33934 SDValue Chain;
33935 if (IsStrict) {
33936 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33937 {N->getOperand(0), Src});
33938 Chain = Res.getValue(1);
33939 } else {
33940 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33941 }
33942 Results.push_back(Res);
33943 if (IsStrict)
33944 Results.push_back(Chain);
33945 return;
33946 }
33947
33948 // Custom widen strict v2f32->v2i32 by padding with zeros.
33949 // FIXME: Should generic type legalizer do this?
33950 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33951 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33952 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33953 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
33954 {N->getOperand(0), Src});
33955 Results.push_back(Res);
33956 Results.push_back(Res.getValue(1));
33957 return;
33958 }
33959
33960 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33961 // so early out here.
33962 return;
33963 }
33964
33965 assert(!VT.isVector() && "Vectors should have been handled above!");
33966
33967 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
33968 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
33969 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
33970 assert(!Subtarget.is64Bit() && "i64 should be legal");
33971 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
33972 // If we use a 128-bit result we might need to use a target specific node.
33973 unsigned SrcElts =
33974 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
33975 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
33976 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
33977 unsigned Opc = N->getOpcode();
33978 if (NumElts != SrcElts) {
33979 if (IsStrict)
33980 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33981 else
33982 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33983 }
33984
33985 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
33986 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
33987 DAG.getConstantFP(0.0, dl, VecInVT), Src,
33988 ZeroIdx);
33989 SDValue Chain;
33990 if (IsStrict) {
33991 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
33992 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33993 Chain = Res.getValue(1);
33994 } else
33995 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
33996 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
33997 Results.push_back(Res);
33998 if (IsStrict)
33999 Results.push_back(Chain);
34000 return;
34001 }
34002
34003 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34004 SDValue Chain;
34005 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34006 Results.push_back(V);
34007 if (IsStrict)
34008 Results.push_back(Chain);
34009 return;
34010 }
34011
34012 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34013 Results.push_back(V);
34014 if (IsStrict)
34015 Results.push_back(Chain);
34016 }
34017 return;
34018 }
34019 case ISD::LRINT:
34020 case ISD::LLRINT: {
34021 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34022 Results.push_back(V);
34023 return;
34024 }
34025
34026 case ISD::SINT_TO_FP:
34027 case ISD::STRICT_SINT_TO_FP:
34028 case ISD::UINT_TO_FP:
34029 case ISD::STRICT_UINT_TO_FP: {
34030 bool IsStrict = N->isStrictFPOpcode();
34031 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34032 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34033 EVT VT = N->getValueType(0);
34034 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34035 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34036 Subtarget.hasVLX()) {
34037 if (Src.getValueType().getVectorElementType() == MVT::i16)
34038 return;
34039
34040 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34041 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34042 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34043 : DAG.getUNDEF(MVT::v2i32));
34044 if (IsStrict) {
34045 unsigned Opc =
34046 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34047 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34048 {N->getOperand(0), Src});
34049 Results.push_back(Res);
34050 Results.push_back(Res.getValue(1));
34051 } else {
34052 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34053 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34054 }
34055 return;
34056 }
34057 if (VT != MVT::v2f32)
34058 return;
34059 EVT SrcVT = Src.getValueType();
34060 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34061 if (IsStrict) {
34062 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34063 : X86ISD::STRICT_CVTUI2P;
34064 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34065 {N->getOperand(0), Src});
34066 Results.push_back(Res);
34067 Results.push_back(Res.getValue(1));
34068 } else {
34069 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34070 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34071 }
34072 return;
34073 }
34074 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34075 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34076 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34077 SDValue One = DAG.getConstant(1, dl, SrcVT);
34078 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34079 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34080 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
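// For inputs with the top bit set, Sign is the value halved with the lost
// low bit ORed back in (round-to-odd), so converting it as a signed value
// and doubling the result (the FADD of SignCvt with itself below) still
// rounds correctly to f32.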
34081 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34082 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34083 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34084 for (int i = 0; i != 2; ++i) {
34085 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34086 SignSrc, DAG.getIntPtrConstant(i, dl));
34087 if (IsStrict)
34088 SignCvts[i] =
34089 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34090 {N->getOperand(0), Elt});
34091 else
34092 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34093 };
34094 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34095 SDValue Slow, Chain;
34096 if (IsStrict) {
34097 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34098 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34099 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34100 {Chain, SignCvt, SignCvt});
34101 Chain = Slow.getValue(1);
34102 } else {
34103 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34104 }
34105 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34106 IsNeg =
34107 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34108 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34109 Results.push_back(Cvt);
34110 if (IsStrict)
34111 Results.push_back(Chain);
34112 return;
34113 }
34114
34115 if (SrcVT != MVT::v2i32)
34116 return;
34117
34118 if (IsSigned || Subtarget.hasAVX512()) {
34119 if (!IsStrict)
34120 return;
34121
34122 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34123 // FIXME: Should generic type legalizer do this?
34124 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34125 DAG.getConstant(0, dl, MVT::v2i32));
34126 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34127 {N->getOperand(0), Src});
34128 Results.push_back(Res);
34129 Results.push_back(Res.getValue(1));
34130 return;
34131 }
34132
34133 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34134 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34135 SDValue VBias = DAG.getConstantFP(
34136 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34137 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34138 DAG.getBitcast(MVT::v2i64, VBias));
34139 Or = DAG.getBitcast(MVT::v2f64, Or);
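// 0x4330000000000000 is the bit pattern of the double 2^52. OR-ing the
// zero-extended 32-bit value into its mantissa yields exactly 2^52 + x, so
// the FSUB (or STRICT_FSUB) below recovers x as an f64, and VFPROUND then
// narrows it to f32.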
34140 if (IsStrict) {
34141 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34142 {N->getOperand(0), Or, VBias});
34143 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34144 {MVT::v4f32, MVT::Other},
34145 {Sub.getValue(1), Sub});
34146 Results.push_back(Res);
34147 Results.push_back(Res.getValue(1));
34148 } else {
34149 // TODO: Are there any fast-math-flags to propagate here?
34150 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34151 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34152 }
34153 return;
34154 }
34155 case ISD::STRICT_FP_ROUND:
34156 case ISD::FP_ROUND: {
34157 bool IsStrict = N->isStrictFPOpcode();
34158 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34159 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34160 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34161 EVT SrcVT = Src.getValueType();
34162 EVT VT = N->getValueType(0);
34163 SDValue V;
34164 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34165 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34166 : DAG.getUNDEF(MVT::v2f32);
34167 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34168 }
34169 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34170 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34171 if (SrcVT.getVectorElementType() != MVT::f32)
34172 return;
34173
34174 if (IsStrict)
34175 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34176 {Chain, Src, Rnd});
34177 else
34178 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34179
34180 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34181 if (IsStrict)
34182 Results.push_back(V.getValue(1));
34183 return;
34184 }
34185 if (!isTypeLegal(Src.getValueType()))
34186 return;
34187 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34188 if (IsStrict)
34189 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34190 {Chain, Src});
34191 else
34192 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34193 Results.push_back(V);
34194 if (IsStrict)
34195 Results.push_back(V.getValue(1));
34196 return;
34197 }
34198 case ISD::FP_EXTEND:
34199 case ISD::STRICT_FP_EXTEND: {
34200 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34201 // No other ValueType for FP_EXTEND should reach this point.
34202 assert(N->getValueType(0) == MVT::v2f32 &&
34203 "Do not know how to legalize this Node");
34204 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34205 return;
34206 bool IsStrict = N->isStrictFPOpcode();
34207 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34208 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34209 : DAG.getUNDEF(MVT::v2f16);
34210 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34211 if (IsStrict)
34212 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34213 {N->getOperand(0), V});
34214 else
34215 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34216 Results.push_back(V);
34217 if (IsStrict)
34218 Results.push_back(V.getValue(1));
34219 return;
34220 }
34221 case ISD::INTRINSIC_W_CHAIN: {
34222 unsigned IntNo = N->getConstantOperandVal(1);
34223 switch (IntNo) {
34224 default : llvm_unreachable("Do not know how to custom type "
34225 "legalize this intrinsic operation!");
34226 case Intrinsic::x86_rdtsc:
34227 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34228 Results);
34229 case Intrinsic::x86_rdtscp:
34230 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34231 Results);
34232 case Intrinsic::x86_rdpmc:
34233 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34234 Results);
34235 return;
34236 case Intrinsic::x86_rdpru:
34237 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34238 Results);
34239 return;
34240 case Intrinsic::x86_xgetbv:
34241 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34242 Results);
34243 return;
34244 }
34245 }
34246 case ISD::READCYCLECOUNTER: {
34247 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34248 }
34249 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34250 EVT T = N->getValueType(0);
34251 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34252 bool Regs64bit = T == MVT::i128;
34253 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34254 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34255 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34256 SDValue cpInL, cpInH;
34257 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
34258 DAG.getConstant(0, dl, HalfT));
34259 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
34260 DAG.getConstant(1, dl, HalfT));
34261 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34262 Regs64bit ? X86::RAX : X86::EAX,
34263 cpInL, SDValue());
34264 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
34265 Regs64bit ? X86::RDX : X86::EDX,
34266 cpInH, cpInL.getValue(1));
34267 SDValue swapInL, swapInH;
34268 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
34269 DAG.getConstant(0, dl, HalfT));
34270 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
34271 DAG.getConstant(1, dl, HalfT));
34272 swapInH =
34273 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34274 swapInH, cpInH.getValue(1));
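// CMPXCHG8B/16B compares against EDX:EAX (RDX:RAX), takes the replacement
// value from ECX:EBX (RCX:RBX), and sets ZF on success; the comparand and
// swap halves are therefore pinned to those registers here, and the result
// and flag are read back from RAX/RDX/EFLAGS below.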
34275
34276 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34277 // until later. So we keep the RBX input in a vreg and use a custom
34278 // inserter.
34279 // Since RBX will be a reserved register, the register allocator will not
34280 // make sure its value is properly saved and restored around this
34281 // live-range.
34282 SDValue Result;
34283 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34284 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34285 if (Regs64bit) {
34286 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34287 swapInH.getValue(1)};
34288 Result =
34289 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34290 } else {
34291 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34292 swapInH.getValue(1));
34293 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34294 swapInL.getValue(1)};
34295 Result =
34296 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34297 }
34298
34299 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34300 Regs64bit ? X86::RAX : X86::EAX,
34301 HalfT, Result.getValue(1));
34302 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34303 Regs64bit ? X86::RDX : X86::EDX,
34304 HalfT, cpOutL.getValue(2));
34305 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34306
34307 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34308 MVT::i32, cpOutH.getValue(2));
34309 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34310 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34311
34312 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34313 Results.push_back(Success);
34314 Results.push_back(EFLAGS.getValue(1));
34315 return;
34316 }
34317 case ISD::ATOMIC_LOAD: {
34318 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34319 bool NoImplicitFloatOps =
34320 DAG.getMachineFunction().getFunction().hasFnAttribute(
34321 Attribute::NoImplicitFloat);
34322 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34323 auto *Node = cast<AtomicSDNode>(N);
34324 if (Subtarget.hasSSE1()) {
34325 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34326 // Then extract the lower 64-bits.
34327 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34328 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34329 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34330 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34331 MVT::i64, Node->getMemOperand());
34332 if (Subtarget.hasSSE2()) {
34333 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34334 DAG.getIntPtrConstant(0, dl));
34335 Results.push_back(Res);
34336 Results.push_back(Ld.getValue(1));
34337 return;
34338 }
34339 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34340 // then casts to i64. This avoids a 128-bit stack temporary being
34341 // created by type legalization if we were to cast v4f32->v2i64.
34342 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34343 DAG.getIntPtrConstant(0, dl));
34344 Res = DAG.getBitcast(MVT::i64, Res);
34345 Results.push_back(Res);
34346 Results.push_back(Ld.getValue(1));
34347 return;
34348 }
34349 if (Subtarget.hasX87()) {
34350 // First load this into an 80-bit X87 register. This will put the whole
34351 // integer into the significand.
34352 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34353 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34354 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34355 dl, Tys, Ops, MVT::i64,
34356 Node->getMemOperand());
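// fild reads the full 64 bits in a single memory access, which is what lets
// this path serve as an atomic i64 load when SSE is unavailable; the value
// is then moved back to integer form via the FIST store to a stack temporary
// below.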
34357 SDValue Chain = Result.getValue(1);
34358
34359 // Now store the X87 register to a stack temporary and convert to i64.
34360 // This store is not atomic and doesn't need to be.
34361 // FIXME: We don't need a stack temporary if the result of the load
34362 // is already being stored. We could just directly store there.
34363 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34364 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34365 MachinePointerInfo MPI =
34366 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34367 SDValue StoreOps[] = { Chain, Result, StackPtr };
34368 Chain = DAG.getMemIntrinsicNode(
34369 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34370 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34371
34372 // Finally load the value back from the stack temporary and return it.
34373 // This load is not atomic and doesn't need to be.
34374 // This load will be further type legalized.
34375 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34376 Results.push_back(Result);
34377 Results.push_back(Result.getValue(1));
34378 return;
34379 }
34380 }
34381 // TODO: Use MOVLPS when SSE1 is available?
34382 // Delegate to generic TypeLegalization. Situations we can really handle
34383 // should have already been dealt with by AtomicExpandPass.cpp.
34384 break;
34385 }
34386 case ISD::ATOMIC_SWAP:
34387 case ISD::ATOMIC_LOAD_ADD:
34388 case ISD::ATOMIC_LOAD_SUB:
34389 case ISD::ATOMIC_LOAD_AND:
34390 case ISD::ATOMIC_LOAD_OR:
34391 case ISD::ATOMIC_LOAD_XOR:
34392 case ISD::ATOMIC_LOAD_NAND:
34393 case ISD::ATOMIC_LOAD_MIN:
34394 case ISD::ATOMIC_LOAD_MAX:
34395 case ISD::ATOMIC_LOAD_UMIN:
34396 case ISD::ATOMIC_LOAD_UMAX:
34397 // Delegate to generic TypeLegalization. Situations we can really handle
34398 // should have already been dealt with by AtomicExpandPass.cpp.
34399 break;
34400
34401 case ISD::BITCAST: {
34402 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34403 EVT DstVT = N->getValueType(0);
34404 EVT SrcVT = N->getOperand(0).getValueType();
34405
34406 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34407 // we can split using the k-register rather than memory.
34408 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34409 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34410 SDValue Lo, Hi;
34411 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34412 Lo = DAG.getBitcast(MVT::i32, Lo);
34413 Hi = DAG.getBitcast(MVT::i32, Hi);
34414 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34415 Results.push_back(Res);
34416 return;
34417 }
34418
34419 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34420 // FIXME: Use v4f32 for SSE1?
34421 assert(Subtarget.hasSSE2() && "Requires SSE2");
34422 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34423 "Unexpected type action!");
34424 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34425 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34426 N->getOperand(0));
34427 Res = DAG.getBitcast(WideVT, Res);
34428 Results.push_back(Res);
34429 return;
34430 }
34431
34432 return;
34433 }
34434 case ISD::MGATHER: {
34435 EVT VT = N->getValueType(0);
34436 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34437 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34438 auto *Gather = cast<MaskedGatherSDNode>(N);
34439 SDValue Index = Gather->getIndex();
34440 if (Index.getValueType() != MVT::v2i64)
34441 return;
34442 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34443 "Unexpected type action!");
34444 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34445 SDValue Mask = Gather->getMask();
34446 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34447 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34448 Gather->getPassThru(),
34449 DAG.getUNDEF(VT));
34450 if (!Subtarget.hasVLX()) {
34451 // We need to widen the mask, but the instruction will only use 2
34452 // of its elements. So we can use undef.
34453 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34454 DAG.getUNDEF(MVT::v2i1));
34455 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34456 }
34457 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34458 Gather->getBasePtr(), Index, Gather->getScale() };
34459 SDValue Res = DAG.getMemIntrinsicNode(
34460 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34461 Gather->getMemoryVT(), Gather->getMemOperand());
34462 Results.push_back(Res);
34463 Results.push_back(Res.getValue(1));
34464 return;
34465 }
34466 return;
34467 }
34468 case ISD::LOAD: {
34469 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34470 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34471 // cast since type legalization will try to use an i64 load.
34472 MVT VT = N->getSimpleValueType(0);
34473 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34474 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34475 "Unexpected type action!");
34476 if (!ISD::isNON_EXTLoad(N))
34477 return;
34478 auto *Ld = cast<LoadSDNode>(N);
34479 if (Subtarget.hasSSE2()) {
34480 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34481 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34482 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34483 Ld->getMemOperand()->getFlags());
34484 SDValue Chain = Res.getValue(1);
34485 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34486 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34487 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34488 Res = DAG.getBitcast(WideVT, Res);
34489 Results.push_back(Res);
34490 Results.push_back(Chain);
34491 return;
34492 }
34493 assert(Subtarget.hasSSE1() && "Expected SSE");
34494 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34495 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34496 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34497 MVT::i64, Ld->getMemOperand());
34498 Results.push_back(Res);
34499 Results.push_back(Res.getValue(1));
34500 return;
34501 }
34502 case ISD::ADDRSPACECAST: {
34503 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34504 Results.push_back(V);
34505 return;
34506 }
34507 case ISD::BITREVERSE: {
34508 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34509 assert(Subtarget.hasXOP() && "Expected XOP");
34510 // We can use VPPERM by copying to a vector register and back. We'll need
34511 // to move the scalar in two i32 pieces.
34512 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34513 return;
34514 }
34515 case ISD::EXTRACT_VECTOR_ELT: {
34516 // f16 = extract vXf16 %vec, i64 %idx
34517 assert(N->getSimpleValueType(0) == MVT::f16 &&
34518 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34519 assert(Subtarget.hasFP16() && "Expected FP16");
34520 SDValue VecOp = N->getOperand(0);
34521 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34522 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34523 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34524 N->getOperand(1));
34525 Split = DAG.getBitcast(MVT::f16, Split);
34526 Results.push_back(Split);
34527 return;
34528 }
34529 }
34530}
34531
34532const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34533 switch ((X86ISD::NodeType)Opcode) {
34534 case X86ISD::FIRST_NUMBER: break;
34535#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34536 NODE_NAME_CASE(BSF)
34537 NODE_NAME_CASE(BSR)
34538 NODE_NAME_CASE(FSHL)
34539 NODE_NAME_CASE(FSHR)
34540 NODE_NAME_CASE(FAND)
34541 NODE_NAME_CASE(FANDN)
34542 NODE_NAME_CASE(FOR)
34543 NODE_NAME_CASE(FXOR)
34544 NODE_NAME_CASE(FILD)
34545 NODE_NAME_CASE(FIST)
34546 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34547 NODE_NAME_CASE(FLD)
34548 NODE_NAME_CASE(FST)
34549 NODE_NAME_CASE(CALL)
34550 NODE_NAME_CASE(CALL_RVMARKER)
34551 NODE_NAME_CASE(BT)
34552 NODE_NAME_CASE(CMP)
34553 NODE_NAME_CASE(FCMP)
34554 NODE_NAME_CASE(STRICT_FCMP)
34555 NODE_NAME_CASE(STRICT_FCMPS)
34556 NODE_NAME_CASE(COMI)
34557 NODE_NAME_CASE(UCOMI)
34558 NODE_NAME_CASE(CMPM)
34559 NODE_NAME_CASE(CMPMM)
34560 NODE_NAME_CASE(STRICT_CMPM)
34561 NODE_NAME_CASE(CMPMM_SAE)
34562 NODE_NAME_CASE(SETCC)
34563 NODE_NAME_CASE(SETCC_CARRY)
34564 NODE_NAME_CASE(FSETCC)
34565 NODE_NAME_CASE(FSETCCM)
34566 NODE_NAME_CASE(FSETCCM_SAE)
34567 NODE_NAME_CASE(CMOV)
34568 NODE_NAME_CASE(BRCOND)
34569 NODE_NAME_CASE(RET_FLAG)
34570 NODE_NAME_CASE(IRET)
34571 NODE_NAME_CASE(REP_STOS)
34572 NODE_NAME_CASE(REP_MOVS)
34573 NODE_NAME_CASE(GlobalBaseReg)
34574 NODE_NAME_CASE(Wrapper)
34575 NODE_NAME_CASE(WrapperRIP)
34576 NODE_NAME_CASE(MOVQ2DQ)
34577 NODE_NAME_CASE(MOVDQ2Q)
34578 NODE_NAME_CASE(MMX_MOVD2W)
34579 NODE_NAME_CASE(MMX_MOVW2D)
34580 NODE_NAME_CASE(PEXTRB)
34581 NODE_NAME_CASE(PEXTRW)
34582 NODE_NAME_CASE(INSERTPS)
34583 NODE_NAME_CASE(PINSRB)
34584 NODE_NAME_CASE(PINSRW)
34585 NODE_NAME_CASE(PSHUFB)
34586 NODE_NAME_CASE(ANDNP)
34587 NODE_NAME_CASE(BLENDI)
34588 NODE_NAME_CASE(BLENDV)
34589 NODE_NAME_CASE(HADD)
34590 NODE_NAME_CASE(HSUB)
34591 NODE_NAME_CASE(FHADD)
34592 NODE_NAME_CASE(FHSUB)
34593 NODE_NAME_CASE(CONFLICT)
34594 NODE_NAME_CASE(FMAX)
34595 NODE_NAME_CASE(FMAXS)
34596 NODE_NAME_CASE(FMAX_SAE)
34597 NODE_NAME_CASE(FMAXS_SAE)
34598 NODE_NAME_CASE(FMIN)
34599 NODE_NAME_CASE(FMINS)
34600 NODE_NAME_CASE(FMIN_SAE)
34601 NODE_NAME_CASE(FMINS_SAE)
34602 NODE_NAME_CASE(FMAXC)
34603 NODE_NAME_CASE(FMINC)
34604 NODE_NAME_CASE(FRSQRT)
34605 NODE_NAME_CASE(FRCP)
34606 NODE_NAME_CASE(EXTRQI)
34607 NODE_NAME_CASE(INSERTQI)
34608 NODE_NAME_CASE(TLSADDR)
34609 NODE_NAME_CASE(TLSBASEADDR)
34610 NODE_NAME_CASE(TLSCALL)
34611 NODE_NAME_CASE(EH_SJLJ_SETJMP)
34612 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
34613 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
34614 NODE_NAME_CASE(EH_RETURN)
34615 NODE_NAME_CASE(TC_RETURN)
34616 NODE_NAME_CASE(FNSTCW16m)
34617 NODE_NAME_CASE(FLDCW16m)
34618 NODE_NAME_CASE(LCMPXCHG_DAG)
34619 NODE_NAME_CASE(LCMPXCHG8_DAG)
34620 NODE_NAME_CASE(LCMPXCHG16_DAG)
34621 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
34622 NODE_NAME_CASE(LADD)
34623 NODE_NAME_CASE(LSUB)
34624 NODE_NAME_CASE(LOR)
34625 NODE_NAME_CASE(LXOR)
34626 NODE_NAME_CASE(LAND)
34627 NODE_NAME_CASE(LBTS)
34628 NODE_NAME_CASE(LBTC)
34629 NODE_NAME_CASE(LBTR)
34630 NODE_NAME_CASE(LBTS_RM)
34631 NODE_NAME_CASE(LBTC_RM)
34632 NODE_NAME_CASE(LBTR_RM)
34633 NODE_NAME_CASE(AADD)
34634 NODE_NAME_CASE(AOR)
34635 NODE_NAME_CASE(AXOR)
34636 NODE_NAME_CASE(AAND)
34637 NODE_NAME_CASE(VZEXT_MOVL)
34638 NODE_NAME_CASE(VZEXT_LOAD)
34639 NODE_NAME_CASE(VEXTRACT_STORE)
34640 NODE_NAME_CASE(VTRUNC)
34641 NODE_NAME_CASE(VTRUNCS)
34642 NODE_NAME_CASE(VTRUNCUS)
34643 NODE_NAME_CASE(VMTRUNC)
34644 NODE_NAME_CASE(VMTRUNCS)
34645 NODE_NAME_CASE(VMTRUNCUS)
34646 NODE_NAME_CASE(VTRUNCSTORES)
34647 NODE_NAME_CASE(VTRUNCSTOREUS)
34648 NODE_NAME_CASE(VMTRUNCSTORES)
34649 NODE_NAME_CASE(VMTRUNCSTOREUS)
34650 NODE_NAME_CASE(VFPEXT)
34651 NODE_NAME_CASE(STRICT_VFPEXT)
34652 NODE_NAME_CASE(VFPEXT_SAE)
34653 NODE_NAME_CASE(VFPEXTS)
34654 NODE_NAME_CASE(VFPEXTS_SAE)
34655 NODE_NAME_CASE(VFPROUND)
34656 NODE_NAME_CASE(STRICT_VFPROUND)
34657 NODE_NAME_CASE(VMFPROUND)
34658 NODE_NAME_CASE(VFPROUND_RND)
34659 NODE_NAME_CASE(VFPROUNDS)
34660 NODE_NAME_CASE(VFPROUNDS_RND)
34661 NODE_NAME_CASE(VSHLDQ)
34662 NODE_NAME_CASE(VSRLDQ)
34663 NODE_NAME_CASE(VSHL)
34664 NODE_NAME_CASE(VSRL)
34665 NODE_NAME_CASE(VSRA)
34666 NODE_NAME_CASE(VSHLI)
34667 NODE_NAME_CASE(VSRLI)
34668 NODE_NAME_CASE(VSRAI)
34669 NODE_NAME_CASE(VSHLV)
34670 NODE_NAME_CASE(VSRLV)
34671 NODE_NAME_CASE(VSRAV)
34672 NODE_NAME_CASE(VROTLI)
34673 NODE_NAME_CASE(VROTRI)
34674 NODE_NAME_CASE(VPPERM)
34675 NODE_NAME_CASE(CMPP)
34676 NODE_NAME_CASE(STRICT_CMPP)
34677 NODE_NAME_CASE(PCMPEQ)
34678 NODE_NAME_CASE(PCMPGT)
34679 NODE_NAME_CASE(PHMINPOS)
34680 NODE_NAME_CASE(ADD)
34681 NODE_NAME_CASE(SUB)
34682 NODE_NAME_CASE(ADC)
34683 NODE_NAME_CASE(SBB)
34684 NODE_NAME_CASE(SMUL)
34685 NODE_NAME_CASE(UMUL)
34686 NODE_NAME_CASE(OR)
34687 NODE_NAME_CASE(XOR)
34688 NODE_NAME_CASE(AND)
34689 NODE_NAME_CASE(BEXTR)
34690 NODE_NAME_CASE(BEXTRI)
34691 NODE_NAME_CASE(BZHI)
34692 NODE_NAME_CASE(PDEP)
34693 NODE_NAME_CASE(PEXT)
34694 NODE_NAME_CASE(MUL_IMM)
34695 NODE_NAME_CASE(MOVMSK)
34696 NODE_NAME_CASE(PTEST)
34697 NODE_NAME_CASE(TESTP)
34698 NODE_NAME_CASE(KORTEST)
34699 NODE_NAME_CASE(KTEST)
34700 NODE_NAME_CASE(KADD)
34701 NODE_NAME_CASE(KSHIFTL)
34702 NODE_NAME_CASE(KSHIFTR)
34703 NODE_NAME_CASE(PACKSS)
34704 NODE_NAME_CASE(PACKUS)
34705 NODE_NAME_CASE(PALIGNR)
34706 NODE_NAME_CASE(VALIGN)
34707 NODE_NAME_CASE(VSHLD)
34708 NODE_NAME_CASE(VSHRD)
34709 NODE_NAME_CASE(VSHLDV)
34710 NODE_NAME_CASE(VSHRDV)
34711 NODE_NAME_CASE(PSHUFD)
34712 NODE_NAME_CASE(PSHUFHW)
34713 NODE_NAME_CASE(PSHUFLW)
34714 NODE_NAME_CASE(SHUFP)
34715 NODE_NAME_CASE(SHUF128)
34716 NODE_NAME_CASE(MOVLHPS)
34717 NODE_NAME_CASE(MOVHLPS)
34718 NODE_NAME_CASE(MOVDDUP)
34719 NODE_NAME_CASE(MOVSHDUP)
34720 NODE_NAME_CASE(MOVSLDUP)
34721 NODE_NAME_CASE(MOVSD)
34722 NODE_NAME_CASE(MOVSS)
34723 NODE_NAME_CASE(MOVSH)
34724 NODE_NAME_CASE(UNPCKL)
34725 NODE_NAME_CASE(UNPCKH)
34726 NODE_NAME_CASE(VBROADCAST)
34727 NODE_NAME_CASE(VBROADCAST_LOAD)
34728 NODE_NAME_CASE(VBROADCASTM)
34729 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
34730 NODE_NAME_CASE(VPERMILPV)
34731 NODE_NAME_CASE(VPERMILPI)
34732 NODE_NAME_CASE(VPERM2X128)
34733 NODE_NAME_CASE(VPERMV)
34734 NODE_NAME_CASE(VPERMV3)
34735 NODE_NAME_CASE(VPERMI)
34736 NODE_NAME_CASE(VPTERNLOG)
34737 NODE_NAME_CASE(VFIXUPIMM)
34738 NODE_NAME_CASE(VFIXUPIMM_SAE)
34739 NODE_NAME_CASE(VFIXUPIMMS)
34740 NODE_NAME_CASE(VFIXUPIMMS_SAE)
34741 NODE_NAME_CASE(VRANGE)
34742 NODE_NAME_CASE(VRANGE_SAE)
34743 NODE_NAME_CASE(VRANGES)
34744 NODE_NAME_CASE(VRANGES_SAE)
34745 NODE_NAME_CASE(PMULUDQ)
34746 NODE_NAME_CASE(PMULDQ)
34747 NODE_NAME_CASE(PSADBW)
34748 NODE_NAME_CASE(DBPSADBW)
34749 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
34750 NODE_NAME_CASE(VAARG_64)
34751 NODE_NAME_CASE(VAARG_X32)
34752 NODE_NAME_CASE(DYN_ALLOCA)
34753 NODE_NAME_CASE(MFENCE)
34754 NODE_NAME_CASE(SEG_ALLOCA)
34755 NODE_NAME_CASE(PROBED_ALLOCA)
34756 NODE_NAME_CASE(RDRAND)
34757 NODE_NAME_CASE(RDSEED)
34758 NODE_NAME_CASE(RDPKRU)
34759 NODE_NAME_CASE(WRPKRU)
34760 NODE_NAME_CASE(VPMADDUBSW)
34761 NODE_NAME_CASE(VPMADDWD)
34762 NODE_NAME_CASE(VPSHA)
34763 NODE_NAME_CASE(VPSHL)
34764 NODE_NAME_CASE(VPCOM)
34765 NODE_NAME_CASE(VPCOMU)
34766 NODE_NAME_CASE(VPERMIL2)
34767 NODE_NAME_CASE(FMSUB)
34768 NODE_NAME_CASE(STRICT_FMSUB)
34769 NODE_NAME_CASE(FNMADD)
34770 NODE_NAME_CASE(STRICT_FNMADD)
34771 NODE_NAME_CASE(FNMSUB)
34772 NODE_NAME_CASE(STRICT_FNMSUB)
34773 NODE_NAME_CASE(FMADDSUB)
34774 NODE_NAME_CASE(FMSUBADD)
34775 NODE_NAME_CASE(FMADD_RND)
34776 NODE_NAME_CASE(FNMADD_RND)
34777 NODE_NAME_CASE(FMSUB_RND)
34778 NODE_NAME_CASE(FNMSUB_RND)
34779 NODE_NAME_CASE(FMADDSUB_RND)
34780 NODE_NAME_CASE(FMSUBADD_RND)
34781 NODE_NAME_CASE(VFMADDC)
34782 NODE_NAME_CASE(VFMADDC_RND)
34783 NODE_NAME_CASE(VFCMADDC)
34784 NODE_NAME_CASE(VFCMADDC_RND)
34785 NODE_NAME_CASE(VFMULC)
34786 NODE_NAME_CASE(VFMULC_RND)
34787 NODE_NAME_CASE(VFCMULC)
34788 NODE_NAME_CASE(VFCMULC_RND)
34789 NODE_NAME_CASE(VFMULCSH)
34790 NODE_NAME_CASE(VFMULCSH_RND)
34791 NODE_NAME_CASE(VFCMULCSH)
34792 NODE_NAME_CASE(VFCMULCSH_RND)
34793 NODE_NAME_CASE(VFMADDCSH)
34794 NODE_NAME_CASE(VFMADDCSH_RND)
34795 NODE_NAME_CASE(VFCMADDCSH)
34796 NODE_NAME_CASE(VFCMADDCSH_RND)
34797 NODE_NAME_CASE(VPMADD52H)
34798 NODE_NAME_CASE(VPMADD52L)
34799 NODE_NAME_CASE(VRNDSCALE)
34800 NODE_NAME_CASE(STRICT_VRNDSCALE)
34801 NODE_NAME_CASE(VRNDSCALE_SAE)
34802 NODE_NAME_CASE(VRNDSCALES)
34803 NODE_NAME_CASE(VRNDSCALES_SAE)
34804 NODE_NAME_CASE(VREDUCE)
34805 NODE_NAME_CASE(VREDUCE_SAE)
34806 NODE_NAME_CASE(VREDUCES)
34807 NODE_NAME_CASE(VREDUCES_SAE)
34808 NODE_NAME_CASE(VGETMANT)
34809 NODE_NAME_CASE(VGETMANT_SAE)
34810 NODE_NAME_CASE(VGETMANTS)
34811 NODE_NAME_CASE(VGETMANTS_SAE)
34812 NODE_NAME_CASE(PCMPESTR)
34813 NODE_NAME_CASE(PCMPISTR)
34814 NODE_NAME_CASE(XTEST)
34815 NODE_NAME_CASE(COMPRESS)
34816 NODE_NAME_CASE(EXPAND)
34817 NODE_NAME_CASE(SELECTS)
34818 NODE_NAME_CASE(ADDSUB)
34819 NODE_NAME_CASE(RCP14)
34820 NODE_NAME_CASE(RCP14S)
34821 NODE_NAME_CASE(RCP28)
34822 NODE_NAME_CASE(RCP28_SAE)
34823 NODE_NAME_CASE(RCP28S)
34824 NODE_NAME_CASE(RCP28S_SAE)
34825 NODE_NAME_CASE(EXP2)
34826 NODE_NAME_CASE(EXP2_SAE)
34827 NODE_NAME_CASE(RSQRT14)
34828 NODE_NAME_CASE(RSQRT14S)
34829 NODE_NAME_CASE(RSQRT28)
34830 NODE_NAME_CASE(RSQRT28_SAE)
34831 NODE_NAME_CASE(RSQRT28S)
34832 NODE_NAME_CASE(RSQRT28S_SAE)
34833 NODE_NAME_CASE(FADD_RND)
34834 NODE_NAME_CASE(FADDS)
34835 NODE_NAME_CASE(FADDS_RND)
34836 NODE_NAME_CASE(FSUB_RND)
34837 NODE_NAME_CASE(FSUBS)
34838 NODE_NAME_CASE(FSUBS_RND)
34839 NODE_NAME_CASE(FMUL_RND)
34840 NODE_NAME_CASE(FMULS)
34841 NODE_NAME_CASE(FMULS_RND)
34842 NODE_NAME_CASE(FDIV_RND)
34843 NODE_NAME_CASE(FDIVS)
34844 NODE_NAME_CASE(FDIVS_RND)
34845 NODE_NAME_CASE(FSQRT_RND)
34846 NODE_NAME_CASE(FSQRTS)
34847 NODE_NAME_CASE(FSQRTS_RND)
34848 NODE_NAME_CASE(FGETEXP)
34849 NODE_NAME_CASE(FGETEXP_SAE)
34850 NODE_NAME_CASE(FGETEXPS)
34851 NODE_NAME_CASE(FGETEXPS_SAE)
34852 NODE_NAME_CASE(SCALEF)
34853 NODE_NAME_CASE(SCALEF_RND)
34854 NODE_NAME_CASE(SCALEFS)
34855 NODE_NAME_CASE(SCALEFS_RND)
34856 NODE_NAME_CASE(MULHRS)
34857 NODE_NAME_CASE(SINT_TO_FP_RND)
34858 NODE_NAME_CASE(UINT_TO_FP_RND)
34859 NODE_NAME_CASE(CVTTP2SI)
34860 NODE_NAME_CASE(CVTTP2UI)
34861 NODE_NAME_CASE(STRICT_CVTTP2SI)
34862 NODE_NAME_CASE(STRICT_CVTTP2UI)
34863 NODE_NAME_CASE(MCVTTP2SI)
34864 NODE_NAME_CASE(MCVTTP2UI)
34865 NODE_NAME_CASE(CVTTP2SI_SAE)
34866 NODE_NAME_CASE(CVTTP2UI_SAE)
34867 NODE_NAME_CASE(CVTTS2SI)
34868 NODE_NAME_CASE(CVTTS2UI)
34869 NODE_NAME_CASE(CVTTS2SI_SAE)
34870 NODE_NAME_CASE(CVTTS2UI_SAE)
34871 NODE_NAME_CASE(CVTSI2P)
34872 NODE_NAME_CASE(CVTUI2P)
34873 NODE_NAME_CASE(STRICT_CVTSI2P)
34874 NODE_NAME_CASE(STRICT_CVTUI2P)
34875 NODE_NAME_CASE(MCVTSI2P)
34876 NODE_NAME_CASE(MCVTUI2P)
34877 NODE_NAME_CASE(VFPCLASS)
34878 NODE_NAME_CASE(VFPCLASSS)
34879 NODE_NAME_CASE(MULTISHIFT)
34880 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
34881 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
34882 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
34883 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
34884 NODE_NAME_CASE(CVTPS2PH)
34885 NODE_NAME_CASE(STRICT_CVTPS2PH)
34886 NODE_NAME_CASE(CVTPS2PH_SAE)
34887 NODE_NAME_CASE(MCVTPS2PH)
34888 NODE_NAME_CASE(MCVTPS2PH_SAE)
34889 NODE_NAME_CASE(CVTPH2PS)
34890 NODE_NAME_CASE(STRICT_CVTPH2PS)
34891 NODE_NAME_CASE(CVTPH2PS_SAE)
34892 NODE_NAME_CASE(CVTP2SI)
34893 NODE_NAME_CASE(CVTP2UI)
34894 NODE_NAME_CASE(MCVTP2SI)
34895 NODE_NAME_CASE(MCVTP2UI)
34896 NODE_NAME_CASE(CVTP2SI_RND)
34897 NODE_NAME_CASE(CVTP2UI_RND)
34898 NODE_NAME_CASE(CVTS2SI)
34899 NODE_NAME_CASE(CVTS2UI)
34900 NODE_NAME_CASE(CVTS2SI_RND)
34901 NODE_NAME_CASE(CVTS2UI_RND)
34902 NODE_NAME_CASE(CVTNE2PS2BF16)
34903 NODE_NAME_CASE(CVTNEPS2BF16)
34904 NODE_NAME_CASE(MCVTNEPS2BF16)
34905 NODE_NAME_CASE(DPBF16PS)
34906 NODE_NAME_CASE(LWPINS)
34907 NODE_NAME_CASE(MGATHER)
34908 NODE_NAME_CASE(MSCATTER)
34909 NODE_NAME_CASE(VPDPBUSD)
34910 NODE_NAME_CASE(VPDPBUSDS)
34911 NODE_NAME_CASE(VPDPWSSD)
34912 NODE_NAME_CASE(VPDPWSSDS)
34913 NODE_NAME_CASE(VPSHUFBITQMB)
34914 NODE_NAME_CASE(GF2P8MULB)
34915 NODE_NAME_CASE(GF2P8AFFINEQB)
34916 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34917 NODE_NAME_CASE(NT_CALL)
34918 NODE_NAME_CASE(NT_BRIND)
34919 NODE_NAME_CASE(UMWAIT)
34920 NODE_NAME_CASE(TPAUSE)
34921 NODE_NAME_CASE(ENQCMD)
34922 NODE_NAME_CASE(ENQCMDS)
34923 NODE_NAME_CASE(VP2INTERSECT)
34924 NODE_NAME_CASE(VPDPBSUD)
34925 NODE_NAME_CASE(VPDPBSUDS)
34926 NODE_NAME_CASE(VPDPBUUD)
34927 NODE_NAME_CASE(VPDPBUUDS)
34928 NODE_NAME_CASE(VPDPBSSD)
34929 NODE_NAME_CASE(VPDPBSSDS)
34930 NODE_NAME_CASE(AESENC128KL)
34931 NODE_NAME_CASE(AESDEC128KL)
34932 NODE_NAME_CASE(AESENC256KL)
34933 NODE_NAME_CASE(AESDEC256KL)
34934 NODE_NAME_CASE(AESENCWIDE128KL)
34935 NODE_NAME_CASE(AESDECWIDE128KL)
34936 NODE_NAME_CASE(AESENCWIDE256KL)
34937 NODE_NAME_CASE(AESDECWIDE256KL)
34938 NODE_NAME_CASE(CMPCCXADD)
34939 NODE_NAME_CASE(TESTUI)
34940 NODE_NAME_CASE(FP80_ADD)
34941 NODE_NAME_CASE(STRICT_FP80_ADD)
34942 }
34943 return nullptr;
34944#undef NODE_NAME_CASE
34945}
34946
34947/// Return true if the addressing mode represented by AM is legal for this
34948/// target, for a load/store of the specified type.
34949bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
34950 const AddrMode &AM, Type *Ty,
34951 unsigned AS,
34952 Instruction *I) const {
34953 // X86 supports extremely general addressing modes.
34954 CodeModel::Model M = getTargetMachine().getCodeModel();
34955
34956 // X86 allows a sign-extended 32-bit immediate field as a displacement.
34957 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
34958 return false;
34959
34960 if (AM.BaseGV) {
34961 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
34962
34963 // If a reference to this global requires an extra load, we can't fold it.
34964 if (isGlobalStubReference(GVFlags))
34965 return false;
34966
34967 // If BaseGV requires a register for the PIC base, we cannot also have a
34968 // BaseReg specified.
34969 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
34970 return false;
34971
34972 // If lower 4G is not available, then we must use rip-relative addressing.
34973 if ((M != CodeModel::Small || isPositionIndependent()) &&
34974 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
34975 return false;
34976 }
34977
34978 switch (AM.Scale) {
34979 case 0:
34980 case 1:
34981 case 2:
34982 case 4:
34983 case 8:
34984 // These scales always work.
34985 break;
34986 case 3:
34987 case 5:
34988 case 9:
34989 // These scales are formed with basereg+scalereg. Only accept if there is
34990 // no basereg yet.
34991 if (AM.HasBaseReg)
34992 return false;
34993 break;
34994 default: // Other stuff never works.
34995 return false;
34996 }
34997
34998 return true;
34999}
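// Illustrative sketch (register names here are hypothetical examples): the
// scales accepted above correspond directly to the x86 SIB addressing form
// base + index*scale + disp32, which folds into a single memory operand, e.g.
//
//   movq (%rdi,%rsi,8), %rax        ; scale 8: addr = rdi + rsi*8
//
// Scales 3, 5 and 9 are only reachable by reusing the index register as the
// base (lea (%rcx,%rcx,2), %rax computes rcx*3), which is why they are
// rejected whenever a separate base register is already in use.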
35000
35001bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35002 unsigned Bits = Ty->getScalarSizeInBits();
35003
35004 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35005 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35006 if (Subtarget.hasXOP() &&
35007 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35008 return false;
35009
35010 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35011 // shifts just as cheap as scalar ones.
35012 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35013 return false;
35014
35015 // AVX512BW has shifts such as vpsllvw.
35016 if (Subtarget.hasBWI() && Bits == 16)
35017 return false;
35018
35019 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35020 // fully general vector.
35021 return true;
35022}
35023
35024bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35025 switch (Opcode) {
35026 // These are non-commutative binops.
35027 // TODO: Add more X86ISD opcodes once we have test coverage.
35028 case X86ISD::ANDNP:
35029 case X86ISD::PCMPGT:
35030 case X86ISD::FMAX:
35031 case X86ISD::FMIN:
35032 case X86ISD::FANDN:
35033 case X86ISD::VPSHA:
35034 case X86ISD::VPSHL:
35035 case X86ISD::VSHLV:
35036 case X86ISD::VSRLV:
35037 case X86ISD::VSRAV:
35038 return true;
35039 }
35040
35041 return TargetLoweringBase::isBinOp(Opcode);
35042}
35043
35044bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35045 switch (Opcode) {
35046 // TODO: Add more X86ISD opcodes once we have test coverage.
35047 case X86ISD::PCMPEQ:
35048 case X86ISD::PMULDQ:
35049 case X86ISD::PMULUDQ:
35050 case X86ISD::FMAXC:
35051 case X86ISD::FMINC:
35052 case X86ISD::FAND:
35053 case X86ISD::FOR:
35054 case X86ISD::FXOR:
35055 return true;
35056 }
35057
35058 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35059}
35060
35061bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35062 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35063 return false;
35064 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35065 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35066 return NumBits1 > NumBits2;
35067}
35068
35069bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35070 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35071 return false;
35072
35073 if (!isTypeLegal(EVT::getEVT(Ty1)))
35074 return false;
35075
35076  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35077
35078 // Assuming the caller doesn't have a zeroext or signext return parameter,
35079 // truncation all the way down to i1 is valid.
35080 return true;
35081}
35082
35083bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35084 return isInt<32>(Imm);
35085}
35086
35087bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35088 // Can also use sub to handle negated immediates.
35089 return isInt<32>(Imm);
35090}
35091
35092bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35093 return isInt<32>(Imm);
35094}
35095
35096bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35097 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35098 return false;
35099 unsigned NumBits1 = VT1.getSizeInBits();
35100 unsigned NumBits2 = VT2.getSizeInBits();
35101 return NumBits1 > NumBits2;
35102}
35103
35104bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35105 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35106 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35107}
35108
35109bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35110 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35111 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35112}
35113
35114bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35115 EVT VT1 = Val.getValueType();
35116 if (isZExtFree(VT1, VT2))
35117 return true;
35118
35119 if (Val.getOpcode() != ISD::LOAD)
35120 return false;
35121
35122 if (!VT1.isSimple() || !VT1.isInteger() ||
35123 !VT2.isSimple() || !VT2.isInteger())
35124 return false;
35125
35126 switch (VT1.getSimpleVT().SimpleTy) {
35127 default: break;
35128 case MVT::i8:
35129 case MVT::i16:
35130 case MVT::i32:
35131 // X86 has 8, 16, and 32-bit zero-extending loads.
35132 return true;
35133 }
35134
35135 return false;
35136}
35137
35138bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35139 SmallVectorImpl<Use *> &Ops) const {
35140 using namespace llvm::PatternMatch;
35141
35142 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35143 if (!VTy)
35144 return false;
35145
35146 if (I->getOpcode() == Instruction::Mul &&
35147 VTy->getElementType()->isIntegerTy(64)) {
35148 for (auto &Op : I->operands()) {
35149 // Make sure we are not already sinking this operand
35150 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35151 continue;
35152
35153 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35154 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
35155 if (Subtarget.hasSSE41() &&
35156 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35157 m_SpecificInt(32)))) {
35158 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35159 Ops.push_back(&Op);
35160 } else if (Subtarget.hasSSE2() &&
35161 match(Op.get(),
35162                      m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35163 Ops.push_back(&Op);
35164 }
35165 }
35166
35167 return !Ops.empty();
35168 }
35169
35170 // A uniform shift amount in a vector shift or funnel shift may be much
35171 // cheaper than a generic variable vector shift, so make that pattern visible
35172 // to SDAG by sinking the shuffle instruction next to the shift.
35173 int ShiftAmountOpNum = -1;
35174 if (I->isShift())
35175 ShiftAmountOpNum = 1;
35176 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35177 if (II->getIntrinsicID() == Intrinsic::fshl ||
35178 II->getIntrinsicID() == Intrinsic::fshr)
35179 ShiftAmountOpNum = 2;
35180 }
35181
35182 if (ShiftAmountOpNum == -1)
35183 return false;
35184
35185 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35186 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35187 isVectorShiftByScalarCheap(I->getType())) {
35188 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35189 return true;
35190 }
35191
35192 return false;
35193}
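// Illustrative sketch of the IR shapes the matchers above look for (the value
// names are hypothetical):
//
//   ; PMULDQ candidate: sext_inreg from vXi32, spelled as shl+ashr by 32
//   %s = shl <2 x i64> %x, <i64 32, i64 32>
//   %a = ashr <2 x i64> %s, <i64 32, i64 32>
//   %p = mul <2 x i64> %a, %y
//
//   ; PMULUDQ candidate: zext_inreg from vXi32, spelled as an and-mask
//   %z = and <2 x i64> %x, <i64 4294967295, i64 4294967295>
//   %p = mul <2 x i64> %z, %y
//
// Sinking the shl/ashr (or and) next to the mul keeps the whole pattern in one
// basic block, so SelectionDAG can select PMULDQ/PMULUDQ.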
35194
35195bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35196 if (!Subtarget.is64Bit())
35197 return false;
35198 return TargetLowering::shouldConvertPhiType(From, To);
35199}
35200
35201bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35202 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35203 return false;
35204
35205 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35206
35207 // There is no extending load for vXi1.
35208 if (SrcVT.getScalarType() == MVT::i1)
35209 return false;
35210
35211 return true;
35212}
35213
35214bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35215 EVT VT) const {
35216 if (!Subtarget.hasAnyFMA())
35217 return false;
35218
35219 VT = VT.getScalarType();
35220
35221 if (!VT.isSimple())
35222 return false;
35223
35224 switch (VT.getSimpleVT().SimpleTy) {
35225 case MVT::f16:
35226 return Subtarget.hasFP16();
35227 case MVT::f32:
35228 case MVT::f64:
35229 return true;
35230 default:
35231 break;
35232 }
35233
35234 return false;
35235}
35236
35237bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
35238 // i16 instructions are longer (0x66 prefix) and potentially slower.
35239 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
35240}
35241
35242bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35243 EVT VT) const {
35244 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35245 // benefit. The transform may also be profitable for scalar code.
35246 if (!Subtarget.hasAVX512())
35247 return false;
35248 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35249 return false;
35250 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35251 return false;
35252
35253 return true;
35254}
35255
35256/// Targets can use this to indicate that they only support *some*
35257/// VECTOR_SHUFFLE operations, those with specific masks.
35258/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35259/// are assumed to be legal.
35260bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35261 if (!VT.isSimple())
35262 return false;
35263
35264 // Not for i1 vectors
35265 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35266 return false;
35267
35268 // Very little shuffling can be done for 64-bit vectors right now.
35269 if (VT.getSimpleVT().getSizeInBits() == 64)
35270 return false;
35271
35272 // We only care that the types being shuffled are legal. The lowering can
35273 // handle any possible shuffle mask that results.
35274 return isTypeLegal(VT.getSimpleVT());
35275}
35276
35277bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35278 EVT VT) const {
35279 // Don't convert an 'and' into a shuffle that we don't directly support.
35280 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35281 if (!Subtarget.hasAVX2())
35282 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35283 return false;
35284
35285 // Just delegate to the generic legality, clear masks aren't special.
35286 return isShuffleMaskLegal(Mask, VT);
35287}
35288
35289bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35290 // If the subtarget is using thunks, we need to not generate jump tables.
35291 if (Subtarget.useIndirectThunkBranches())
35292 return false;
35293
35294 // Otherwise, fallback on the generic logic.
35295 return TargetLowering::areJTsAllowed(Fn);
35296}
35297
35298MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35299 EVT ConditionVT) const {
35300 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35301 // zero-extensions.
35302 if (ConditionVT.getSizeInBits() < 32)
35303 return MVT::i32;
35304 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35305 ConditionVT);
35306}
35307
35308//===----------------------------------------------------------------------===//
35309// X86 Scheduler Hooks
35310//===----------------------------------------------------------------------===//
35311
35312// Returns true if EFLAG is consumed after this iterator in the rest of the
35313// basic block or any successors of the basic block.
35314static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35315 MachineBasicBlock *BB) {
35316 // Scan forward through BB for a use/def of EFLAGS.
35317 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35318 if (mi.readsRegister(X86::EFLAGS))
35319 return true;
35320 // If we found a def, we can stop searching.
35321 if (mi.definesRegister(X86::EFLAGS))
35322 return false;
35323 }
35324
35325 // If we hit the end of the block, check whether EFLAGS is live into a
35326 // successor.
35327 for (MachineBasicBlock *Succ : BB->successors())
35328 if (Succ->isLiveIn(X86::EFLAGS))
35329 return true;
35330
35331 return false;
35332}
35333
35334/// Utility function to emit xbegin specifying the start of an RTM region.
35335static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35336 const TargetInstrInfo *TII) {
35337 const DebugLoc &DL = MI.getDebugLoc();
35338
35339 const BasicBlock *BB = MBB->getBasicBlock();
35340 MachineFunction::iterator I = ++MBB->getIterator();
35341
35342 // For the v = xbegin(), we generate
35343 //
35344 // thisMBB:
35345 // xbegin sinkMBB
35346 //
35347 // mainMBB:
35348 // s0 = -1
35349 //
35350 // fallBB:
35351 // eax = # XABORT_DEF
35352 // s1 = eax
35353 //
35354 // sinkMBB:
35355 // v = phi(s0/mainBB, s1/fallBB)
35356
35357 MachineBasicBlock *thisMBB = MBB;
35358 MachineFunction *MF = MBB->getParent();
35359 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35360 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35361 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35362 MF->insert(I, mainMBB);
35363 MF->insert(I, fallMBB);
35364 MF->insert(I, sinkMBB);
35365
35366 if (isEFLAGSLiveAfter(MI, MBB)) {
35367 mainMBB->addLiveIn(X86::EFLAGS);
35368 fallMBB->addLiveIn(X86::EFLAGS);
35369 sinkMBB->addLiveIn(X86::EFLAGS);
35370 }
35371
35372 // Transfer the remainder of BB and its successor edges to sinkMBB.
35373 sinkMBB->splice(sinkMBB->begin(), MBB,
35374 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35375 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35376
35377 MachineRegisterInfo &MRI = MF->getRegInfo();
35378 Register DstReg = MI.getOperand(0).getReg();
35379 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35380 Register mainDstReg = MRI.createVirtualRegister(RC);
35381 Register fallDstReg = MRI.createVirtualRegister(RC);
35382
35383 // thisMBB:
35384 // xbegin fallMBB
35385 // # fallthrough to mainMBB
35386  //  # abort path to fallMBB
35387 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35388 thisMBB->addSuccessor(mainMBB);
35389 thisMBB->addSuccessor(fallMBB);
35390
35391 // mainMBB:
35392 // mainDstReg := -1
35393 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35394 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35395 mainMBB->addSuccessor(sinkMBB);
35396
35397 // fallMBB:
35398 // ; pseudo instruction to model hardware's definition from XABORT
35399 // EAX := XABORT_DEF
35400 // fallDstReg := EAX
35401 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
35402 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
35403 .addReg(X86::EAX);
35404 fallMBB->addSuccessor(sinkMBB);
35405
35406 // sinkMBB:
35407 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35408 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
35409 .addReg(mainDstReg).addMBB(mainMBB)
35410 .addReg(fallDstReg).addMBB(fallMBB);
35411
35412 MI.eraseFromParent();
35413 return sinkMBB;
35414}
35415
35416MachineBasicBlock *
35417X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35418 MachineBasicBlock *MBB) const {
35419 // Emit va_arg instruction on X86-64.
35420
35421 // Operands to this pseudo-instruction:
35422 // 0 ) Output : destination address (reg)
35423 // 1-5) Input : va_list address (addr, i64mem)
35424 // 6 ) ArgSize : Size (in bytes) of vararg type
35425 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35426 // 8 ) Align : Alignment of type
35427 // 9 ) EFLAGS (implicit-def)
35428
35429  assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35430 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35431
35432 Register DestReg = MI.getOperand(0).getReg();
35433 MachineOperand &Base = MI.getOperand(1);
35434 MachineOperand &Scale = MI.getOperand(2);
35435 MachineOperand &Index = MI.getOperand(3);
35436 MachineOperand &Disp = MI.getOperand(4);
35437 MachineOperand &Segment = MI.getOperand(5);
35438 unsigned ArgSize = MI.getOperand(6).getImm();
35439 unsigned ArgMode = MI.getOperand(7).getImm();
35440 Align Alignment = Align(MI.getOperand(8).getImm());
35441
35442 MachineFunction *MF = MBB->getParent();
35443
35444 // Memory Reference
35445  assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35446
35447 MachineMemOperand *OldMMO = MI.memoperands().front();
35448
35449 // Clone the MMO into two separate MMOs for loading and storing
35450 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35451 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35452 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35453 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35454
35455 // Machine Information
35456 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35457 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35458 const TargetRegisterClass *AddrRegClass =
35459 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35460 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35461 const DebugLoc &DL = MI.getDebugLoc();
35462
35463 // struct va_list {
35464 // i32 gp_offset
35465 // i32 fp_offset
35466 // i64 overflow_area (address)
35467 // i64 reg_save_area (address)
35468 // }
35469 // sizeof(va_list) = 24
35470 // alignment(va_list) = 8
35471
35472 unsigned TotalNumIntRegs = 6;
35473 unsigned TotalNumXMMRegs = 8;
35474 bool UseGPOffset = (ArgMode == 1);
35475 bool UseFPOffset = (ArgMode == 2);
35476 unsigned MaxOffset = TotalNumIntRegs * 8 +
35477 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35478
35479 /* Align ArgSize to a multiple of 8 */
35480 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35481 bool NeedsAlign = (Alignment > 8);
35482
35483 MachineBasicBlock *thisMBB = MBB;
35484 MachineBasicBlock *overflowMBB;
35485 MachineBasicBlock *offsetMBB;
35486 MachineBasicBlock *endMBB;
35487
35488 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
35489 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
35490 unsigned OffsetReg = 0;
35491
35492 if (!UseGPOffset && !UseFPOffset) {
35493 // If we only pull from the overflow region, we don't create a branch.
35494 // We don't need to alter control flow.
35495 OffsetDestReg = 0; // unused
35496 OverflowDestReg = DestReg;
35497
35498 offsetMBB = nullptr;
35499 overflowMBB = thisMBB;
35500 endMBB = thisMBB;
35501 } else {
35502 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35503 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35504 // If not, pull from overflow_area. (branch to overflowMBB)
35505 //
35506 // thisMBB
35507 // | .
35508 // | .
35509 // offsetMBB overflowMBB
35510 // | .
35511 // | .
35512 // endMBB
35513
35514 // Registers for the PHI in endMBB
35515 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35516 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35517
35518 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35519 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35520 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35521 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35522
35523 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35524
35525 // Insert the new basic blocks
35526 MF->insert(MBBIter, offsetMBB);
35527 MF->insert(MBBIter, overflowMBB);
35528 MF->insert(MBBIter, endMBB);
35529
35530 // Transfer the remainder of MBB and its successor edges to endMBB.
35531 endMBB->splice(endMBB->begin(), thisMBB,
35532 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35533 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35534
35535 // Make offsetMBB and overflowMBB successors of thisMBB
35536 thisMBB->addSuccessor(offsetMBB);
35537 thisMBB->addSuccessor(overflowMBB);
35538
35539 // endMBB is a successor of both offsetMBB and overflowMBB
35540 offsetMBB->addSuccessor(endMBB);
35541 overflowMBB->addSuccessor(endMBB);
35542
35543 // Load the offset value into a register
35544 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35545 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
35546 .add(Base)
35547 .add(Scale)
35548 .add(Index)
35549 .addDisp(Disp, UseFPOffset ? 4 : 0)
35550 .add(Segment)
35551 .setMemRefs(LoadOnlyMMO);
35552
35553 // Check if there is enough room left to pull this argument.
35554 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
35555 .addReg(OffsetReg)
35556 .addImm(MaxOffset + 8 - ArgSizeA8);
35557
35558 // Branch to "overflowMBB" if offset >= max
35559 // Fall through to "offsetMBB" otherwise
35560 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
35561 .addMBB(overflowMBB).addImm(X86::COND_AE);
35562 }
35563
35564 // In offsetMBB, emit code to use the reg_save_area.
35565 if (offsetMBB) {
35566    assert(OffsetReg != 0);
35567
35568 // Read the reg_save_area address.
35569 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35570 BuildMI(
35571 offsetMBB, DL,
35572 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35573 RegSaveReg)
35574 .add(Base)
35575 .add(Scale)
35576 .add(Index)
35577 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35578 .add(Segment)
35579 .setMemRefs(LoadOnlyMMO);
35580
35581 if (Subtarget.isTarget64BitLP64()) {
35582 // Zero-extend the offset
35583 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35584 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35585 .addImm(0)
35586 .addReg(OffsetReg)
35587 .addImm(X86::sub_32bit);
35588
35589 // Add the offset to the reg_save_area to get the final address.
35590 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
35591 .addReg(OffsetReg64)
35592 .addReg(RegSaveReg);
35593 } else {
35594 // Add the offset to the reg_save_area to get the final address.
35595 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
35596 .addReg(OffsetReg)
35597 .addReg(RegSaveReg);
35598 }
35599
35600 // Compute the offset for the next argument
35601 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35602 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
35603 .addReg(OffsetReg)
35604 .addImm(UseFPOffset ? 16 : 8);
35605
35606 // Store it back into the va_list.
35607 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
35608 .add(Base)
35609 .add(Scale)
35610 .add(Index)
35611 .addDisp(Disp, UseFPOffset ? 4 : 0)
35612 .add(Segment)
35613 .addReg(NextOffsetReg)
35614 .setMemRefs(StoreOnlyMMO);
35615
35616 // Jump to endMBB
35617 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
35618 .addMBB(endMBB);
35619 }
35620
35621 //
35622 // Emit code to use overflow area
35623 //
35624
35625 // Load the overflow_area address into a register.
35626 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
35627 BuildMI(overflowMBB, DL,
35628 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35629 OverflowAddrReg)
35630 .add(Base)
35631 .add(Scale)
35632 .add(Index)
35633 .addDisp(Disp, 8)
35634 .add(Segment)
35635 .setMemRefs(LoadOnlyMMO);
35636
35637 // If we need to align it, do so. Otherwise, just copy the address
35638 // to OverflowDestReg.
35639 if (NeedsAlign) {
35640 // Align the overflow address
35641 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
35642
35643 // aligned_addr = (addr + (align-1)) & ~(align-1)
35644 BuildMI(
35645 overflowMBB, DL,
35646 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35647 TmpReg)
35648 .addReg(OverflowAddrReg)
35649 .addImm(Alignment.value() - 1);
35650
35651 BuildMI(
35652 overflowMBB, DL,
35653 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
35654 OverflowDestReg)
35655 .addReg(TmpReg)
35656 .addImm(~(uint64_t)(Alignment.value() - 1));
35657 } else {
35658 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
35659 .addReg(OverflowAddrReg);
35660 }
35661
35662 // Compute the next overflow address after this argument.
35663 // (the overflow address should be kept 8-byte aligned)
35664 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
35665 BuildMI(
35666 overflowMBB, DL,
35667 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35668 NextAddrReg)
35669 .addReg(OverflowDestReg)
35670 .addImm(ArgSizeA8);
35671
35672 // Store the new overflow address.
35673 BuildMI(overflowMBB, DL,
35674 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
35675 .add(Base)
35676 .add(Scale)
35677 .add(Index)
35678 .addDisp(Disp, 8)
35679 .add(Segment)
35680 .addReg(NextAddrReg)
35681 .setMemRefs(StoreOnlyMMO);
35682
35683 // If we branched, emit the PHI to the front of endMBB.
35684 if (offsetMBB) {
35685 BuildMI(*endMBB, endMBB->begin(), DL,
35686 TII->get(X86::PHI), DestReg)
35687 .addReg(OffsetDestReg).addMBB(offsetMBB)
35688 .addReg(OverflowDestReg).addMBB(overflowMBB);
35689 }
35690
35691 // Erase the pseudo instruction
35692 MI.eraseFromParent();
35693
35694 return endMBB;
35695}
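// Illustrative sketch of the bound check built above, using the SysV x86-64
// register counts hard-coded in this function (6 GP registers, 8 XMM
// registers); the pseudo-code simplifies the overflow-area bookkeeping:
//
//   gp_offset path:  MaxOffset = 6*8          = 48
//   fp_offset path:  MaxOffset = 6*8 + 8*16   = 176
//
//   if (offset >= MaxOffset + 8 - align8(ArgSize))       // COND_AE branch
//     addr = align(overflow_area); overflow_area = addr + align8(ArgSize);
//   else
//     addr = reg_save_area + offset; offset += UseFPOffset ? 16 : 8;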
35696
35697// The EFLAGS operand of SelectItr might be missing a kill marker
35698// because there were multiple uses of EFLAGS, and ISel didn't know
35699// which to mark. Figure out whether SelectItr should have had a
35700// kill marker, and set it if it should. Returns the correct kill
35701// marker value.
35702static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
35703 MachineBasicBlock* BB,
35704 const TargetRegisterInfo* TRI) {
35705 if (isEFLAGSLiveAfter(SelectItr, BB))
35706 return false;
35707
35708 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
35709 // out. SelectMI should have a kill flag on EFLAGS.
35710 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
35711 return true;
35712}
35713
35714// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
35715// together with other CMOV pseudo-opcodes into a single basic-block with
35716// conditional jump around it.
35717static bool isCMOVPseudo(MachineInstr &MI) {
35718 switch (MI.getOpcode()) {
35719 case X86::CMOV_FR16:
35720 case X86::CMOV_FR16X:
35721 case X86::CMOV_FR32:
35722 case X86::CMOV_FR32X:
35723 case X86::CMOV_FR64:
35724 case X86::CMOV_FR64X:
35725 case X86::CMOV_GR8:
35726 case X86::CMOV_GR16:
35727 case X86::CMOV_GR32:
35728 case X86::CMOV_RFP32:
35729 case X86::CMOV_RFP64:
35730 case X86::CMOV_RFP80:
35731 case X86::CMOV_VR64:
35732 case X86::CMOV_VR128:
35733 case X86::CMOV_VR128X:
35734 case X86::CMOV_VR256:
35735 case X86::CMOV_VR256X:
35736 case X86::CMOV_VR512:
35737 case X86::CMOV_VK1:
35738 case X86::CMOV_VK2:
35739 case X86::CMOV_VK4:
35740 case X86::CMOV_VK8:
35741 case X86::CMOV_VK16:
35742 case X86::CMOV_VK32:
35743 case X86::CMOV_VK64:
35744 return true;
35745
35746 default:
35747 return false;
35748 }
35749}
35750
35751// Helper function, which inserts PHI functions into SinkMBB:
35752// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
35753// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
35754// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
35755// the last PHI function inserted.
35756static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
35757 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
35758 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
35759 MachineBasicBlock *SinkMBB) {
35760 MachineFunction *MF = TrueMBB->getParent();
35761 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
35762 const DebugLoc &DL = MIItBegin->getDebugLoc();
35763
35764 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35765 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35766
35767 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35768
35769 // As we are creating the PHIs, we have to be careful if there is more than
35770 // one. Later CMOVs may reference the results of earlier CMOVs, but later
35771 // PHIs have to reference the individual true/false inputs from earlier PHIs.
35772 // That also means that PHI construction must work forward from earlier to
35773 // later, and that the code must maintain a mapping from earlier PHI's
35774 // destination registers, and the registers that went into the PHI.
35775 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
35776 MachineInstrBuilder MIB;
35777
35778 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
35779 Register DestReg = MIIt->getOperand(0).getReg();
35780 Register Op1Reg = MIIt->getOperand(1).getReg();
35781 Register Op2Reg = MIIt->getOperand(2).getReg();
35782
35783 // If this CMOV we are generating is the opposite condition from
35784 // the jump we generated, then we have to swap the operands for the
35785 // PHI that is going to be generated.
35786 if (MIIt->getOperand(3).getImm() == OppCC)
35787 std::swap(Op1Reg, Op2Reg);
35788
35789 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
35790 Op1Reg = RegRewriteTable[Op1Reg].first;
35791
35792 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
35793 Op2Reg = RegRewriteTable[Op2Reg].second;
35794
35795 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
35796 .addReg(Op1Reg)
35797 .addMBB(FalseMBB)
35798 .addReg(Op2Reg)
35799 .addMBB(TrueMBB);
35800
35801 // Add this PHI to the rewrite table.
35802 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
35803 }
35804
35805 return MIB;
35806}
35807
35808// Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
35809MachineBasicBlock *
35810X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
35811 MachineInstr &SecondCascadedCMOV,
35812 MachineBasicBlock *ThisMBB) const {
35813 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35814 const DebugLoc &DL = FirstCMOV.getDebugLoc();
35815
35816 // We lower cascaded CMOVs such as
35817 //
35818 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
35819 //
35820 // to two successive branches.
35821 //
35822 // Without this, we would add a PHI between the two jumps, which ends up
35823 // creating a few copies all around. For instance, for
35824 //
35825 // (sitofp (zext (fcmp une)))
35826 //
35827 // we would generate:
35828 //
35829 // ucomiss %xmm1, %xmm0
35830 // movss <1.0f>, %xmm0
35831 // movaps %xmm0, %xmm1
35832 // jne .LBB5_2
35833 // xorps %xmm1, %xmm1
35834 // .LBB5_2:
35835 // jp .LBB5_4
35836 // movaps %xmm1, %xmm0
35837 // .LBB5_4:
35838 // retq
35839 //
35840 // because this custom-inserter would have generated:
35841 //
35842 // A
35843 // | \
35844 // | B
35845 // | /
35846 // C
35847 // | \
35848 // | D
35849 // | /
35850 // E
35851 //
35852 // A: X = ...; Y = ...
35853 // B: empty
35854 // C: Z = PHI [X, A], [Y, B]
35855 // D: empty
35856 // E: PHI [X, C], [Z, D]
35857 //
35858 // If we lower both CMOVs in a single step, we can instead generate:
35859 //
35860 // A
35861 // | \
35862 // | C
35863 // | /|
35864 // |/ |
35865 // | |
35866 // | D
35867 // | /
35868 // E
35869 //
35870 // A: X = ...; Y = ...
35871 // D: empty
35872 // E: PHI [X, A], [X, C], [Y, D]
35873 //
35874 // Which, in our sitofp/fcmp example, gives us something like:
35875 //
35876 // ucomiss %xmm1, %xmm0
35877 // movss <1.0f>, %xmm0
35878 // jne .LBB5_4
35879 // jp .LBB5_4
35880 // xorps %xmm0, %xmm0
35881 // .LBB5_4:
35882 // retq
35883 //
35884
35885 // We lower cascaded CMOV into two successive branches to the same block.
35886 // EFLAGS is used by both, so mark it as live in the second.
35887 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35888 MachineFunction *F = ThisMBB->getParent();
35889 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35890 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35891 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35892
35893 MachineFunction::iterator It = ++ThisMBB->getIterator();
35894 F->insert(It, FirstInsertedMBB);
35895 F->insert(It, SecondInsertedMBB);
35896 F->insert(It, SinkMBB);
35897
35898 // For a cascaded CMOV, we lower it to two successive branches to
35899 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35900 // the FirstInsertedMBB.
35901 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35902
35903 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35904 // live into the sink and copy blocks.
35905 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35906 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
35907 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35908 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35909 SinkMBB->addLiveIn(X86::EFLAGS);
35910 }
35911
35912 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35913 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35914 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35915 ThisMBB->end());
35916 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35917
35918 // Fallthrough block for ThisMBB.
35919 ThisMBB->addSuccessor(FirstInsertedMBB);
35920 // The true block target of the first branch is always SinkMBB.
35921 ThisMBB->addSuccessor(SinkMBB);
35922 // Fallthrough block for FirstInsertedMBB.
35923 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35924 // The true block for the branch of FirstInsertedMBB.
35925 FirstInsertedMBB->addSuccessor(SinkMBB);
35926 // This is fallthrough.
35927 SecondInsertedMBB->addSuccessor(SinkMBB);
35928
35929 // Create the conditional branch instructions.
35930 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35931 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35932
35933 X86::CondCode SecondCC =
35934 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35935 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
35936
35937 // SinkMBB:
35938 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
35939 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35940 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35941 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35942 MachineInstrBuilder MIB =
35943 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
35944 .addReg(Op1Reg)
35945 .addMBB(SecondInsertedMBB)
35946 .addReg(Op2Reg)
35947 .addMBB(ThisMBB);
35948
35949  // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
35950  // (the True operand of the SELECT_CC/CMOV nodes).
35951 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35952
35953 // Now remove the CMOVs.
35954 FirstCMOV.eraseFromParent();
35955 SecondCascadedCMOV.eraseFromParent();
35956
35957 return SinkMBB;
35958}
35959
35960MachineBasicBlock *
35961X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
35962 MachineBasicBlock *ThisMBB) const {
35963 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35964 const DebugLoc &DL = MI.getDebugLoc();
35965
35966 // To "insert" a SELECT_CC instruction, we actually have to insert the
35967 // diamond control-flow pattern. The incoming instruction knows the
35968 // destination vreg to set, the condition code register to branch on, the
35969 // true/false values to select between and a branch opcode to use.
35970
35971 // ThisMBB:
35972 // ...
35973 // TrueVal = ...
35974 // cmpTY ccX, r1, r2
35975 // bCC copy1MBB
35976 // fallthrough --> FalseMBB
35977
35978 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35979 // as described above, by inserting a BB, and then making a PHI at the join
35980 // point to select the true and false operands of the CMOV in the PHI.
35981 //
35982 // The code also handles two different cases of multiple CMOV opcodes
35983 // in a row.
35984 //
35985 // Case 1:
35986 // In this case, there are multiple CMOVs in a row, all which are based on
35987 // the same condition setting (or the exact opposite condition setting).
35988 // In this case we can lower all the CMOVs using a single inserted BB, and
35989 // then make a number of PHIs at the join point to model the CMOVs. The only
35990 // trickiness here, is that in a case like:
35991 //
35992 // t2 = CMOV cond1 t1, f1
35993 // t3 = CMOV cond1 t2, f2
35994 //
35995 // when rewriting this into PHIs, we have to perform some renaming on the
35996 // temps since you cannot have a PHI operand refer to a PHI result earlier
35997 // in the same block. The "simple" but wrong lowering would be:
35998 //
35999 // t2 = PHI t1(BB1), f1(BB2)
36000 // t3 = PHI t2(BB1), f2(BB2)
36001 //
36002 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36003 // renaming is to note that on the path through BB1, t2 is really just a
36004 // copy of t1, and do that renaming, properly generating:
36005 //
36006 // t2 = PHI t1(BB1), f1(BB2)
36007 // t3 = PHI t1(BB1), f2(BB2)
36008 //
36009 // Case 2:
36010 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36011 // function - EmitLoweredCascadedSelect.
36012
36013 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36014 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36015 MachineInstr *LastCMOV = &MI;
36016 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36017
36018 // Check for case 1, where there are multiple CMOVs with the same condition
36019 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36020 // number of jumps the most.
36021
36022 if (isCMOVPseudo(MI)) {
36023 // See if we have a string of CMOVS with the same condition. Skip over
36024 // intervening debug insts.
36025 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36026 (NextMIIt->getOperand(3).getImm() == CC ||
36027 NextMIIt->getOperand(3).getImm() == OppCC)) {
36028 LastCMOV = &*NextMIIt;
36029 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36030 }
36031 }
36032
36033 // This checks for case 2, but only do this if we didn't already find
36034 // case 1, as indicated by LastCMOV == MI.
36035 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36036 NextMIIt->getOpcode() == MI.getOpcode() &&
36037 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36038 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36039 NextMIIt->getOperand(1).isKill()) {
36040 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36041 }
36042
36043 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36044 MachineFunction *F = ThisMBB->getParent();
36045 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36046 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36047
36048 MachineFunction::iterator It = ++ThisMBB->getIterator();
36049 F->insert(It, FalseMBB);
36050 F->insert(It, SinkMBB);
36051
36052 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36053 // live into the sink and copy blocks.
36054 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36055 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36056 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36057 FalseMBB->addLiveIn(X86::EFLAGS);
36058 SinkMBB->addLiveIn(X86::EFLAGS);
36059 }
36060
36061 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36062 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36063 MachineBasicBlock::iterator(LastCMOV));
36064 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36065 if (MI.isDebugInstr())
36066 SinkMBB->push_back(MI.removeFromParent());
36067
36068 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36069 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36070 std::next(MachineBasicBlock::iterator(LastCMOV)),
36071 ThisMBB->end());
36072 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36073
36074 // Fallthrough block for ThisMBB.
36075 ThisMBB->addSuccessor(FalseMBB);
36076 // The true block target of the first (or only) branch is always a SinkMBB.
36077 ThisMBB->addSuccessor(SinkMBB);
36078 // Fallthrough block for FalseMBB.
36079 FalseMBB->addSuccessor(SinkMBB);
36080
36081 // Create the conditional branch instruction.
36082 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36083
36084 // SinkMBB:
36085 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36086 // ...
36087 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36088 MachineBasicBlock::iterator MIItEnd =
36089 std::next(MachineBasicBlock::iterator(LastCMOV));
36090 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36091
36092 // Now remove the CMOV(s).
36093 ThisMBB->erase(MIItBegin, MIItEnd);
36094
36095 return SinkMBB;
36096}
36097
36098static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36099 if (IsLP64) {
36100 if (isInt<8>(Imm))
36101 return X86::SUB64ri8;
36102 return X86::SUB64ri32;
36103 } else {
36104 if (isInt<8>(Imm))
36105 return X86::SUB32ri8;
36106 return X86::SUB32ri;
36107 }
36108}
36109
36110MachineBasicBlock *
36111X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36112 MachineBasicBlock *MBB) const {
36113 MachineFunction *MF = MBB->getParent();
36114 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36115 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36116 const DebugLoc &DL = MI.getDebugLoc();
36117 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36118
36119 const unsigned ProbeSize = getStackProbeSize(*MF);
36120
36121 MachineRegisterInfo &MRI = MF->getRegInfo();
36122 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36123 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36124 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36125
36126 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36127 MF->insert(MBBIter, testMBB);
36128 MF->insert(MBBIter, blockMBB);
36129 MF->insert(MBBIter, tailMBB);
36130
36131 Register sizeVReg = MI.getOperand(1).getReg();
36132
36133 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36134
36135 Register TmpStackPtr = MRI.createVirtualRegister(
36136 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36137 Register FinalStackPtr = MRI.createVirtualRegister(
36138 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36139
36140 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36141 .addReg(physSPReg);
36142 {
36143 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36144 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36145 .addReg(TmpStackPtr)
36146 .addReg(sizeVReg);
36147 }
36148
36149 // test rsp size
36150
36151 BuildMI(testMBB, DL,
36152 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36153 .addReg(FinalStackPtr)
36154 .addReg(physSPReg);
36155
36156 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36157 .addMBB(tailMBB)
36158 .addImm(X86::COND_GE);
36159 testMBB->addSuccessor(blockMBB);
36160 testMBB->addSuccessor(tailMBB);
36161
36162 // Touch the block, then extend it. This is the opposite order from a static
36163 // probe, where we allocate and then touch; doing it this way avoids having to
36164 // probe the tail of the static alloca. Possible scenarios are:
36165 //
36166 // + ---- <- ------------ <- ------------- <- ------------ +
36167 // | |
36168 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36169 // | |
36170 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36171 //
36172 // The property we want to enforce is to never have more than [page alloc] between two probes.
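//
// As a rough pseudo-C sketch of the loop built below (editor's illustration,
// not part of the original source; names mirror the registers used here):
//
//   final_sp = sp - alloca_size;        // SUBrr emitted above
//   while (final_sp < sp) {             // testMBB: CMP + JCC_1 (COND_GE exits)
//     *(volatile char *)sp ^= 0;        // blockMBB: touch the current page
//     sp -= ProbeSize;                  //           then extend by ProbeSize
//   }
//   result = final_sp;                  // tailMBB: COPY into the dest vreg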
36173
36174 const unsigned XORMIOpc =
36175 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36176 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36177 .addImm(0);
36178
36179 BuildMI(blockMBB, DL,
36180 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36181 .addReg(physSPReg)
36182 .addImm(ProbeSize);
36183
36184
36185 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36186 blockMBB->addSuccessor(testMBB);
36187
36188 // Replace original instruction by the expected stack ptr
36189 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36190 .addReg(FinalStackPtr);
36191
36192 tailMBB->splice(tailMBB->end(), MBB,
36193 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36194 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36195 MBB->addSuccessor(testMBB);
36196
36197 // Delete the original pseudo instruction.
36198 MI.eraseFromParent();
36199
36200 // And we're done.
36201 return tailMBB;
36202}
36203
36204MachineBasicBlock *
36205X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36206 MachineBasicBlock *BB) const {
36207 MachineFunction *MF = BB->getParent();
36208 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36209 const DebugLoc &DL = MI.getDebugLoc();
36210 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36211
36212 assert(MF->shouldSplitStack());
36213
36214 const bool Is64Bit = Subtarget.is64Bit();
36215 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36216
36217 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36218 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36219
36220 // BB:
36221 // ... [Till the alloca]
36222 // If stacklet is not large enough, jump to mallocMBB
36223 //
36224 // bumpMBB:
36225 // Allocate by subtracting from RSP
36226 // Jump to continueMBB
36227 //
36228 // mallocMBB:
36229 // Allocate by call to runtime
36230 //
36231 // continueMBB:
36232 // ...
36233 // [rest of original BB]
36234 //
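//
// Roughly, in pseudo-C (editor's illustration, not part of the original
// source; TlsOffset is the per-thread stack-limit slot selected above):
//
//   new_sp = sp - size;
//   if (new_sp < *(tls_base + TlsOffset))                  // stacklet too small
//     result = __morestack_allocate_stack_space(size);     // mallocMBB
//   else {
//     sp = new_sp;                                         // bumpMBB
//     result = new_sp;
//   }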
36235
36236 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36237 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36238 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36239
36240 MachineRegisterInfo &MRI = MF->getRegInfo();
36241 const TargetRegisterClass *AddrRegClass =
36242 getRegClassFor(getPointerTy(MF->getDataLayout()));
36243
36244 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36245 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36246 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36247 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36248 sizeVReg = MI.getOperand(1).getReg(),
36249 physSPReg =
36250 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36251
36252 MachineFunction::iterator MBBIter = ++BB->getIterator();
36253
36254 MF->insert(MBBIter, bumpMBB);
36255 MF->insert(MBBIter, mallocMBB);
36256 MF->insert(MBBIter, continueMBB);
36257
36258 continueMBB->splice(continueMBB->begin(), BB,
36259 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36260 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36261
36262 // Add code to the main basic block to check if the stack limit has been hit,
36263 // and if so, jump to mallocMBB otherwise to bumpMBB.
36264 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36265 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36266 .addReg(tmpSPVReg).addReg(sizeVReg);
36267 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36268 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36269 .addReg(SPLimitVReg);
36270 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36271
36272 // bumpMBB simply decreases the stack pointer, since we know the current
36273 // stacklet has enough space.
36274 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36275 .addReg(SPLimitVReg);
36276 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36277 .addReg(SPLimitVReg);
36278 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36279
36280 // Calls into a routine in libgcc to allocate more space from the heap.
36281 const uint32_t *RegMask =
36282 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36283 if (IsLP64) {
36284 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36285 .addReg(sizeVReg);
36286 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36287 .addExternalSymbol("__morestack_allocate_stack_space")
36288 .addRegMask(RegMask)
36289 .addReg(X86::RDI, RegState::Implicit)
36290 .addReg(X86::RAX, RegState::ImplicitDefine);
36291 } else if (Is64Bit) {
36292 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
36293 .addReg(sizeVReg);
36294 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36295 .addExternalSymbol("__morestack_allocate_stack_space")
36296 .addRegMask(RegMask)
36297 .addReg(X86::EDI, RegState::Implicit)
36298 .addReg(X86::EAX, RegState::ImplicitDefine);
36299 } else {
36300 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36301 .addImm(12);
36302 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36303 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
36304 .addExternalSymbol("__morestack_allocate_stack_space")
36305 .addRegMask(RegMask)
36306 .addReg(X86::EAX, RegState::ImplicitDefine);
36307 }
36308
36309 if (!Is64Bit)
36310 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36311 .addImm(16);
36312
36313 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36314 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36315 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36316
36317 // Set up the CFG correctly.
36318 BB->addSuccessor(bumpMBB);
36319 BB->addSuccessor(mallocMBB);
36320 mallocMBB->addSuccessor(continueMBB);
36321 bumpMBB->addSuccessor(continueMBB);
36322
36323 // Take care of the PHI nodes.
36324 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
36325 MI.getOperand(0).getReg())
36326 .addReg(mallocPtrVReg)
36327 .addMBB(mallocMBB)
36328 .addReg(bumpSPPtrVReg)
36329 .addMBB(bumpMBB);
36330
36331 // Delete the original pseudo instruction.
36332 MI.eraseFromParent();
36333
36334 // And we're done.
36335 return continueMBB;
36336}
36337
36338MachineBasicBlock *
36339X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36340 MachineBasicBlock *BB) const {
36341 MachineFunction *MF = BB->getParent();
36342 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36343 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36344 const DebugLoc &DL = MI.getDebugLoc();
36345
36346 assert(!isAsynchronousEHPersonality(
36347            classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36348        "SEH does not use catchret!");
36349
36350 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36351 if (!Subtarget.is32Bit())
36352 return BB;
36353
36354 // C++ EH creates a new target block to hold the restore code, and wires up
36355 // the new block to the return destination with a normal JMP_4.
36356 MachineBasicBlock *RestoreMBB =
36357 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36358 assert(BB->succ_size() == 1);
36359 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36360 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36361 BB->addSuccessor(RestoreMBB);
36362 MI.getOperand(0).setMBB(RestoreMBB);
36363
36364 // Marking this as an EH pad but not a funclet entry block causes PEI to
36365 // restore stack pointers in the block.
36366 RestoreMBB->setIsEHPad(true);
36367
36368 auto RestoreMBBI = RestoreMBB->begin();
36369 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36370 return BB;
36371}
36372
36373MachineBasicBlock *
36374X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
36375 MachineBasicBlock *BB) const {
36376 // So, here we replace TLSADDR with the sequence:
36377 // adjust_stackdown -> TLSADDR -> adjust_stackup.
36378 // We need this because TLSADDR is lowered into calls
36379 // inside MC; without the two markers, shrink-wrapping
36380 // may push the prologue/epilogue past them.
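// A minimal sketch of the resulting sequence (editor's illustration, not part
// of the original source; CALLSEQ_START/END stand for the frame setup/destroy
// pseudos returned by the two TII queries below):
//
//   CALLSEQ_START 0, 0, 0
//   TLSADDR ...              ; expands to a call inside MC
//   CALLSEQ_END 0, 0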
36381 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36382 const DebugLoc &DL = MI.getDebugLoc();
36383 MachineFunction &MF = *BB->getParent();
36384
36385 // Emit CALLSEQ_START right before the instruction.
36386 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36387 MachineInstrBuilder CallseqStart =
36388 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36389 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36390
36391 // Emit CALLSEQ_END right after the instruction.
36392 // We don't call erase from parent because we want to keep the
36393 // original instruction around.
36394 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36395 MachineInstrBuilder CallseqEnd =
36396 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
36397 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36398
36399 return BB;
36400}
36401
36402MachineBasicBlock *
36403X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36404 MachineBasicBlock *BB) const {
36405 // This is pretty easy. We're taking the value that we received from
36406 // our load from the relocation, sticking it in either RDI (x86-64)
36407 // or EAX and doing an indirect call. The return value will then
36408 // be in the normal return register.
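// On x86-64, for example, the emitted sequence is morally (editor's
// illustration, not part of the original source; "_var" is a placeholder):
//
//   movq   _var@TLVP(%rip), %rdi     ; load the TLV descriptor address
//   callq  *(%rdi)                   ; call its getter; result lands in %rax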
36409 MachineFunction *F = BB->getParent();
36410 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36411 const DebugLoc &DL = MI.getDebugLoc();
36412
36413 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36414 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36415
36416 // Get a register mask for the lowered call.
36417 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36418 // proper register mask.
36419 const uint32_t *RegMask =
36420 Subtarget.is64Bit() ?
36421 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36422 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36423 if (Subtarget.is64Bit()) {
36424 MachineInstrBuilder MIB =
36425 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
36426 .addReg(X86::RIP)
36427 .addImm(0)
36428 .addReg(0)
36429 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36430 MI.getOperand(3).getTargetFlags())
36431 .addReg(0);
36432 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
36433 addDirectMem(MIB, X86::RDI);
36434 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36435 } else if (!isPositionIndependent()) {
36436 MachineInstrBuilder MIB =
36437 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36438 .addReg(0)
36439 .addImm(0)
36440 .addReg(0)
36441 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36442 MI.getOperand(3).getTargetFlags())
36443 .addReg(0);
36444 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36445 addDirectMem(MIB, X86::EAX);
36446 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36447 } else {
36448 MachineInstrBuilder MIB =
36449 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36450 .addReg(TII->getGlobalBaseReg(F))
36451 .addImm(0)
36452 .addReg(0)
36453 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36454 MI.getOperand(3).getTargetFlags())
36455 .addReg(0);
36456 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36457 addDirectMem(MIB, X86::EAX);
36458 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36459 }
36460
36461 MI.eraseFromParent(); // The pseudo instruction is gone now.
36462 return BB;
36463}
36464
36465static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36466 switch (RPOpc) {
36467 case X86::INDIRECT_THUNK_CALL32:
36468 return X86::CALLpcrel32;
36469 case X86::INDIRECT_THUNK_CALL64:
36470 return X86::CALL64pcrel32;
36471 case X86::INDIRECT_THUNK_TCRETURN32:
36472 return X86::TCRETURNdi;
36473 case X86::INDIRECT_THUNK_TCRETURN64:
36474 return X86::TCRETURNdi64;
36475 }
36476 llvm_unreachable("not indirect thunk opcode");
36477}
36478
36479static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36480 unsigned Reg) {
36481 if (Subtarget.useRetpolineExternalThunk()) {
36482 // When using an external thunk for retpolines, we pick names that match the
36483 // names GCC happens to use as well. This helps simplify the implementation
36484 // of the thunks for kernels where they have no easy ability to create
36485 // aliases and are doing non-trivial configuration of the thunk's body. For
36486 // example, the Linux kernel will do boot-time hot patching of the thunk
36487 // bodies and cannot easily export aliases of these to loaded modules.
36488 //
36489 // Note that at any point in the future, we may need to change the semantics
36490 // of how we implement retpolines and at that time will likely change the
36491 // name of the called thunk. Essentially, there is no hard guarantee that
36492 // LLVM will generate calls to specific thunks, we merely make a best-effort
36493 // attempt to help out kernels and other systems where duplicating the
36494 // thunks is costly.
36495 switch (Reg) {
36496 case X86::EAX:
36497     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36498 return "__x86_indirect_thunk_eax";
36499 case X86::ECX:
36500     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36501 return "__x86_indirect_thunk_ecx";
36502 case X86::EDX:
36503     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36504 return "__x86_indirect_thunk_edx";
36505 case X86::EDI:
36506     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36507 return "__x86_indirect_thunk_edi";
36508 case X86::R11:
36509     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36510 return "__x86_indirect_thunk_r11";
36511 }
36512   llvm_unreachable("unexpected reg for external indirect thunk");
36513 }
36514
36515 if (Subtarget.useRetpolineIndirectCalls() ||
36516 Subtarget.useRetpolineIndirectBranches()) {
36517 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36518 switch (Reg) {
36519 case X86::EAX:
36520     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36521 return "__llvm_retpoline_eax";
36522 case X86::ECX:
36523     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36524 return "__llvm_retpoline_ecx";
36525 case X86::EDX:
36526     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36527 return "__llvm_retpoline_edx";
36528 case X86::EDI:
36529     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36530 return "__llvm_retpoline_edi";
36531 case X86::R11:
36532     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36533 return "__llvm_retpoline_r11";
36534 }
36535   llvm_unreachable("unexpected reg for retpoline");
36536 }
36537
36538 if (Subtarget.useLVIControlFlowIntegrity()) {
36539   assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36540 return "__llvm_lvi_thunk_r11";
36541 }
36542 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36543}
36544
36545MachineBasicBlock *
36546X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36547 MachineBasicBlock *BB) const {
36548 // Copy the virtual register into the R11 physical register and
36549 // call the retpoline thunk.
36550 const DebugLoc &DL = MI.getDebugLoc();
36551 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36552 Register CalleeVReg = MI.getOperand(0).getReg();
36553 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36554
36555 // Find an available scratch register to hold the callee. On 64-bit, we can
36556 // just use R11, but we scan for uses anyway to ensure we don't generate
36557 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36558 // already a register use operand to the call to hold the callee. If none
36559 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36560 // register and ESI is the base pointer to realigned stack frames with VLAs.
36561 SmallVector<unsigned, 3> AvailableRegs;
36562 if (Subtarget.is64Bit())
36563 AvailableRegs.push_back(X86::R11);
36564 else
36565 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36566
36567 // Zero out any registers that are already used.
36568 for (const auto &MO : MI.operands()) {
36569 if (MO.isReg() && MO.isUse())
36570 for (unsigned &Reg : AvailableRegs)
36571 if (Reg == MO.getReg())
36572 Reg = 0;
36573 }
36574
36575 // Choose the first remaining non-zero available register.
36576 unsigned AvailableReg = 0;
36577 for (unsigned MaybeReg : AvailableRegs) {
36578 if (MaybeReg) {
36579 AvailableReg = MaybeReg;
36580 break;
36581 }
36582 }
36583 if (!AvailableReg)
36584 report_fatal_error("calling convention incompatible with retpoline, no "
36585 "available registers");
36586
36587 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36588
36589 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
36590 .addReg(CalleeVReg);
36591 MI.getOperand(0).ChangeToES(Symbol);
36592 MI.setDesc(TII->get(Opc));
36593 MachineInstrBuilder(*BB->getParent(), &MI)
36594 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36595 return BB;
36596}
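// For example, on x86-64 an INDIRECT_THUNK_CALL64 of a callee held in a vreg
// ends up as (editor's illustration, not part of the original source):
//
//   movq  %vreg, %r11
//   callq __x86_indirect_thunk_r11   ; or __llvm_retpoline_r11 / __llvm_lvi_thunk_r11,
//                                    ; depending on the selected thunk flavor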
36597
36598/// SetJmp implies future control flow change upon calling the corresponding
36599/// LongJmp.
36600/// Instead of using the 'return' instruction, the long jump fixes the stack and
36601/// performs an indirect branch. To do so it uses the registers that were stored
36602/// in the jump buffer (when calling SetJmp).
36603/// If the shadow stack is enabled, we need to fix it as well, because some
36604/// return addresses will be skipped.
36605/// The function will save the SSP for future fixing in the function
36606/// emitLongJmpShadowStackFix.
36607/// \sa emitLongJmpShadowStackFix
36608/// \param [in] MI The temporary Machine Instruction for the builtin.
36609/// \param [in] MBB The Machine Basic Block that will be modified.
36610void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36611 MachineBasicBlock *MBB) const {
36612 const DebugLoc &DL = MI.getDebugLoc();
36613 MachineFunction *MF = MBB->getParent();
36614 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36615 MachineRegisterInfo &MRI = MF->getRegInfo();
36616 MachineInstrBuilder MIB;
36617
36618 // Memory Reference.
36619 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36620 MI.memoperands_end());
36621
36622 // Initialize a register with zero.
36623 MVT PVT = getPointerTy(MF->getDataLayout());
36624 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36625 Register ZReg = MRI.createVirtualRegister(PtrRC);
36626 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36627 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
36628 .addDef(ZReg)
36629 .addReg(ZReg, RegState::Undef)
36630 .addReg(ZReg, RegState::Undef);
36631
36632 // Read the current SSP Register value to the zeroed register.
36633 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36634 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36635 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36636
36637 // Write the SSP register value to offset 3 in input memory buffer.
36638 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36639 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
36640 const int64_t SSPOffset = 3 * PVT.getStoreSize();
36641 const unsigned MemOpndSlot = 1;
36642 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36643 if (i == X86::AddrDisp)
36644 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
36645 else
36646 MIB.add(MI.getOperand(MemOpndSlot + i));
36647 }
36648 MIB.addReg(SSPCopyReg);
36649 MIB.setMemRefs(MMOs);
36650}
36651
36652MachineBasicBlock *
36653X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
36654 MachineBasicBlock *MBB) const {
36655 const DebugLoc &DL = MI.getDebugLoc();
36656 MachineFunction *MF = MBB->getParent();
36657 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36658 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36659 MachineRegisterInfo &MRI = MF->getRegInfo();
36660
36661 const BasicBlock *BB = MBB->getBasicBlock();
36662 MachineFunction::iterator I = ++MBB->getIterator();
36663
36664 // Memory Reference
36665 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36666 MI.memoperands_end());
36667
36668 unsigned DstReg;
36669 unsigned MemOpndSlot = 0;
36670
36671 unsigned CurOp = 0;
36672
36673 DstReg = MI.getOperand(CurOp++).getReg();
36674 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36675 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36676 (void)TRI;
36677 Register mainDstReg = MRI.createVirtualRegister(RC);
36678 Register restoreDstReg = MRI.createVirtualRegister(RC);
36679
36680 MemOpndSlot = CurOp;
36681
36682 MVT PVT = getPointerTy(MF->getDataLayout());
36683 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36684        "Invalid Pointer Size!");
36685
36686 // For v = setjmp(buf), we generate
36687 //
36688 // thisMBB:
36689 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
36690 // SjLjSetup restoreMBB
36691 //
36692 // mainMBB:
36693 // v_main = 0
36694 //
36695 // sinkMBB:
36696 // v = phi(main, restore)
36697 //
36698 // restoreMBB:
36699 // if base pointer being used, load it from frame
36700 // v_restore = 1
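//
// As can be read off the offsets used in this lowering and in
// emitEHSjLjLongJmp below (editor's note, not part of the original source),
// the jump buffer is laid out in pointer-sized slots:
//
//   buf[0] = frame pointer        (reloaded with no displacement)
//   buf[1] = label / resume IP    (LabelOffset = 1 * PVT.getStoreSize())
//   buf[2] = stack pointer        (SPOffset    = 2 * PVT.getStoreSize())
//   buf[3] = shadow stack pointer (SSPOffset   = 3 * PVT.getStoreSize())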
36701
36702 MachineBasicBlock *thisMBB = MBB;
36703 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36704 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36705 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36706 MF->insert(I, mainMBB);
36707 MF->insert(I, sinkMBB);
36708 MF->push_back(restoreMBB);
36709 restoreMBB->setMachineBlockAddressTaken();
36710
36711 MachineInstrBuilder MIB;
36712
36713 // Transfer the remainder of BB and its successor edges to sinkMBB.
36714 sinkMBB->splice(sinkMBB->begin(), MBB,
36715 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36716 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36717
36718 // thisMBB:
36719 unsigned PtrStoreOpc = 0;
36720 unsigned LabelReg = 0;
36721 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36722 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36723 !isPositionIndependent();
36724
36725 // Prepare IP either in reg or imm.
36726 if (!UseImmLabel) {
36727 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36728 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36729 LabelReg = MRI.createVirtualRegister(PtrRC);
36730 if (Subtarget.is64Bit()) {
36731 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
36732 .addReg(X86::RIP)
36733 .addImm(0)
36734 .addReg(0)
36735 .addMBB(restoreMBB)
36736 .addReg(0);
36737 } else {
36738 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
36739 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
36740 .addReg(XII->getGlobalBaseReg(MF))
36741 .addImm(0)
36742 .addReg(0)
36743 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
36744 .addReg(0);
36745 }
36746 } else
36747 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36748 // Store IP
36749 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
36750 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36751 if (i == X86::AddrDisp)
36752 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
36753 else
36754 MIB.add(MI.getOperand(MemOpndSlot + i));
36755 }
36756 if (!UseImmLabel)
36757 MIB.addReg(LabelReg);
36758 else
36759 MIB.addMBB(restoreMBB);
36760 MIB.setMemRefs(MMOs);
36761
36762 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36763 emitSetJmpShadowStackFix(MI, thisMBB);
36764 }
36765
36766 // Setup
36767 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
36768 .addMBB(restoreMBB);
36769
36770 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36771 MIB.addRegMask(RegInfo->getNoPreservedMask());
36772 thisMBB->addSuccessor(mainMBB);
36773 thisMBB->addSuccessor(restoreMBB);
36774
36775 // mainMBB:
36776 // EAX = 0
36777 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
36778 mainMBB->addSuccessor(sinkMBB);
36779
36780 // sinkMBB:
36781 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
36782 TII->get(X86::PHI), DstReg)
36783 .addReg(mainDstReg).addMBB(mainMBB)
36784 .addReg(restoreDstReg).addMBB(restoreMBB);
36785
36786 // restoreMBB:
36787 if (RegInfo->hasBasePointer(*MF)) {
36788 const bool Uses64BitFramePtr =
36789 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36790 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
36791 X86FI->setRestoreBasePointer(MF);
36792 Register FramePtr = RegInfo->getFrameRegister(*MF);
36793 Register BasePtr = RegInfo->getBaseRegister();
36794 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
36795 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
36796 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36797 .setMIFlag(MachineInstr::FrameSetup);
36798 }
36799 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36800 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36801 restoreMBB->addSuccessor(sinkMBB);
36802
36803 MI.eraseFromParent();
36804 return sinkMBB;
36805}
36806
36807/// Fix the shadow stack using the previously saved SSP pointer.
36808/// \sa emitSetJmpShadowStackFix
36809/// \param [in] MI The temporary Machine Instruction for the builtin.
36810/// \param [in] MBB The Machine Basic Block that will be modified.
36811/// \return The sink MBB that will perform the future indirect branch.
36812MachineBasicBlock *
36813X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
36814 MachineBasicBlock *MBB) const {
36815 const DebugLoc &DL = MI.getDebugLoc();
36816 MachineFunction *MF = MBB->getParent();
36817 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36818 MachineRegisterInfo &MRI = MF->getRegInfo();
36819
36820 // Memory Reference
36821 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36822 MI.memoperands_end());
36823
36824 MVT PVT = getPointerTy(MF->getDataLayout());
36825 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36826
36827 // checkSspMBB:
36828 // xor vreg1, vreg1
36829 // rdssp vreg1
36830 // test vreg1, vreg1
36831 // je sinkMBB # Jump if Shadow Stack is not supported
36832 // fallMBB:
36833 // mov buf+24/12(%rip), vreg2
36834 // sub vreg1, vreg2
36835 // jbe sinkMBB # No need to fix the Shadow Stack
36836 // fixShadowMBB:
36837 // shr 3/2, vreg2
36838 // incssp vreg2 # fix the SSP according to the lower 8 bits
36839 // shr 8, vreg2
36840 // je sinkMBB
36841 // fixShadowLoopPrepareMBB:
36842 // shl vreg2
36843 // mov 128, vreg3
36844 // fixShadowLoopMBB:
36845 // incssp vreg3
36846 // dec vreg2
36847 // jne fixShadowLoopMBB # Iterate until you finish fixing
36848 // # the Shadow Stack
36849 // sinkMBB:
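//
// In pseudo-C (editor's illustration, not part of the original source), the
// amount to unwind is computed as a count of shadow-stack entries:
//
//   delta   = saved_ssp - current_ssp;       // fallMBB: bytes to unwind
//   entries = delta >> (is64 ? 3 : 2);       // each shadow-stack entry is 8/4 bytes
//   incssp(entries);                         // fixShadowMBB: consumes entries & 0xff
//   if (entries >> 8)                        // anything left?
//     for (n = (entries >> 8) << 1; n; --n)  // fixShadowLoop*: n iterations of
//       incssp(128);                         //   incssp(128) = (entries >> 8) * 256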
36850
36851 MachineFunction::iterator I = ++MBB->getIterator();
36852 const BasicBlock *BB = MBB->getBasicBlock();
36853
36854 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36855 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36856 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36857 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36858 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36859 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36860 MF->insert(I, checkSspMBB);
36861 MF->insert(I, fallMBB);
36862 MF->insert(I, fixShadowMBB);
36863 MF->insert(I, fixShadowLoopPrepareMBB);
36864 MF->insert(I, fixShadowLoopMBB);
36865 MF->insert(I, sinkMBB);
36866
36867 // Transfer the remainder of BB and its successor edges to sinkMBB.
36868 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36869 MBB->end());
36870 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36871
36872 MBB->addSuccessor(checkSspMBB);
36873
36874 // Initialize a register with zero.
36875 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36876 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
36877
36878 if (PVT == MVT::i64) {
36879 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36880 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36881 .addImm(0)
36882 .addReg(ZReg)
36883 .addImm(X86::sub_32bit);
36884 ZReg = TmpZReg;
36885 }
36886
36887 // Read the current SSP Register value to the zeroed register.
36888 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36889 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36890 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36891
36892 // Check whether the value read from the SSP register is zero and jump
36893 // directly to the sink.
36894 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36895 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
36896 .addReg(SSPCopyReg)
36897 .addReg(SSPCopyReg);
36898 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
36899 checkSspMBB->addSuccessor(sinkMBB);
36900 checkSspMBB->addSuccessor(fallMBB);
36901
36902 // Reload the previously saved SSP register value.
36903 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36904 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36905 const int64_t SPPOffset = 3 * PVT.getStoreSize();
36906 MachineInstrBuilder MIB =
36907 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
36908 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36909 const MachineOperand &MO = MI.getOperand(i);
36910 if (i == X86::AddrDisp)
36911 MIB.addDisp(MO, SPPOffset);
36912 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36913 // preserve kill flags.
36914 MIB.addReg(MO.getReg());
36915 else
36916 MIB.add(MO);
36917 }
36918 MIB.setMemRefs(MMOs);
36919
36920 // Subtract the current SSP from the previous SSP.
36921 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36922 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36923 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
36924 .addReg(PrevSSPReg)
36925 .addReg(SSPCopyReg);
36926
36927 // Jump to the sink if PrevSSPReg <= SSPCopyReg.
36928 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
36929 fallMBB->addSuccessor(sinkMBB);
36930 fallMBB->addSuccessor(fixShadowMBB);
36931
36932 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
36933 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36934 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36935 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36936 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
36937 .addReg(SspSubReg)
36938 .addImm(Offset);
36939
36940 // Increase the SSP using only the lower 8 bits of the delta.
36941 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36942 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36943
36944 // Reset the lower 8 bits.
36945 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36946 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
36947 .addReg(SspFirstShrReg)
36948 .addImm(8);
36949
36950 // Jump if the result of the shift is zero.
36951 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
36952 fixShadowMBB->addSuccessor(sinkMBB);
36953 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36954
36955 // Do a single shift left.
36956 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
36957 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36958 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
36959 .addReg(SspSecondShrReg);
36960
36961 // Save the value 128 to a register (will be used next with incssp).
36962 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36963 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36964 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
36965 .addImm(128);
36966 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36967
36968 // Since incssp only looks at the lower 8 bits, we might need to do several
36969 // iterations of incssp until we finish fixing the shadow stack.
36970 Register DecReg = MRI.createVirtualRegister(PtrRC);
36971 Register CounterReg = MRI.createVirtualRegister(PtrRC);
36972 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
36973 .addReg(SspAfterShlReg)
36974 .addMBB(fixShadowLoopPrepareMBB)
36975 .addReg(DecReg)
36976 .addMBB(fixShadowLoopMBB);
36977
36978 // Every iteration we increase the SSP by 128.
36979 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
36980
36981 // Every iteration we decrement the counter by 1.
36982 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36983 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
36984
36985 // Jump if the counter is not zero yet.
36986 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
36987 fixShadowLoopMBB->addSuccessor(sinkMBB);
36988 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36989
36990 return sinkMBB;
36991}
36992
36993MachineBasicBlock *
36994X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36995 MachineBasicBlock *MBB) const {
36996 const DebugLoc &DL = MI.getDebugLoc();
36997 MachineFunction *MF = MBB->getParent();
36998 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36999 MachineRegisterInfo &MRI = MF->getRegInfo();
37000
37001 // Memory Reference
37002 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37003 MI.memoperands_end());
37004
37005 MVT PVT = getPointerTy(MF->getDataLayout());
37006 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37007        "Invalid Pointer Size!");
37008
37009 const TargetRegisterClass *RC =
37010 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37011 Register Tmp = MRI.createVirtualRegister(RC);
37012 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37013 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37014 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37015 Register SP = RegInfo->getStackRegister();
37016
37017 MachineInstrBuilder MIB;
37018
37019 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37020 const int64_t SPOffset = 2 * PVT.getStoreSize();
37021
37022 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37023 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37024
37025 MachineBasicBlock *thisMBB = MBB;
37026
37027 // When CET and the shadow stack are enabled, we need to fix the shadow stack.
37028 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37029 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37030 }
37031
37032 // Reload FP
37033 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37034 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37035 const MachineOperand &MO = MI.getOperand(i);
37036 if (MO.isReg()) // Don't add the whole operand, we don't want to
37037 // preserve kill flags.
37038 MIB.addReg(MO.getReg());
37039 else
37040 MIB.add(MO);
37041 }
37042 MIB.setMemRefs(MMOs);
37043
37044 // Reload IP
37045 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37046 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37047 const MachineOperand &MO = MI.getOperand(i);
37048 if (i == X86::AddrDisp)
37049 MIB.addDisp(MO, LabelOffset);
37050 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37051 // preserve kill flags.
37052 MIB.addReg(MO.getReg());
37053 else
37054 MIB.add(MO);
37055 }
37056 MIB.setMemRefs(MMOs);
37057
37058 // Reload SP
37059 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37060 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37061 if (i == X86::AddrDisp)
37062 MIB.addDisp(MI.getOperand(i), SPOffset);
37063 else
37064 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37065 // the last instruction of the expansion.
37066 }
37067 MIB.setMemRefs(MMOs);
37068
37069 // Jump
37070 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37071
37072 MI.eraseFromParent();
37073 return thisMBB;
37074}
37075
37076void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37077 MachineBasicBlock *MBB,
37078 MachineBasicBlock *DispatchBB,
37079 int FI) const {
37080 const DebugLoc &DL = MI.getDebugLoc();
37081 MachineFunction *MF = MBB->getParent();
37082 MachineRegisterInfo *MRI = &MF->getRegInfo();
37083 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37084
37085 MVT PVT = getPointerTy(MF->getDataLayout());
37086 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37087
37088 unsigned Op = 0;
37089 unsigned VR = 0;
37090
37091 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37092 !isPositionIndependent();
37093
37094 if (UseImmLabel) {
37095 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37096 } else {
37097 const TargetRegisterClass *TRC =
37098 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37099 VR = MRI->createVirtualRegister(TRC);
37100 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37101
37102 if (Subtarget.is64Bit())
37103 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37104 .addReg(X86::RIP)
37105 .addImm(1)
37106 .addReg(0)
37107 .addMBB(DispatchBB)
37108 .addReg(0);
37109 else
37110 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37111 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37112 .addImm(1)
37113 .addReg(0)
37114 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37115 .addReg(0);
37116 }
37117
37118 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37119 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37120 if (UseImmLabel)
37121 MIB.addMBB(DispatchBB);
37122 else
37123 MIB.addReg(VR);
37124}
37125
37126MachineBasicBlock *
37127X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37128 MachineBasicBlock *BB) const {
37129 const DebugLoc &DL = MI.getDebugLoc();
37130 MachineFunction *MF = BB->getParent();
37131 MachineRegisterInfo *MRI = &MF->getRegInfo();
37132 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37133 int FI = MF->getFrameInfo().getFunctionContextIndex();
37134
37135 // Get a mapping of the call site numbers to all of the landing pads they're
37136 // associated with.
37137 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37138 unsigned MaxCSNum = 0;
37139 for (auto &MBB : *MF) {
37140 if (!MBB.isEHPad())
37141 continue;
37142
37143 MCSymbol *Sym = nullptr;
37144 for (const auto &MI : MBB) {
37145 if (MI.isDebugInstr())
37146 continue;
37147
37148     assert(MI.isEHLabel() && "expected EH_LABEL");
37149 Sym = MI.getOperand(0).getMCSymbol();
37150 break;
37151 }
37152
37153 if (!MF->hasCallSiteLandingPad(Sym))
37154 continue;
37155
37156 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37157 CallSiteNumToLPad[CSI].push_back(&MBB);
37158 MaxCSNum = std::max(MaxCSNum, CSI);
37159 }
37160 }
37161
37162 // Get an ordered list of the machine basic blocks for the jump table.
37163 std::vector<MachineBasicBlock *> LPadList;
37164 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37165 LPadList.reserve(CallSiteNumToLPad.size());
37166
37167 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37168 for (auto &LP : CallSiteNumToLPad[CSI]) {
37169 LPadList.push_back(LP);
37170 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37171 }
37172 }
37173
37174 assert(!LPadList.empty() &&
37175        "No landing pad destinations for the dispatch jump table!");
37176
37177 // Create the MBBs for the dispatch code.
37178
37179 // Shove the dispatch's address into the return slot in the function context.
37180 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37181 DispatchBB->setIsEHPad(true);
37182
37183 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37184 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37185 DispatchBB->addSuccessor(TrapBB);
37186
37187 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37188 DispatchBB->addSuccessor(DispContBB);
37189
37190 // Insert MBBs.
37191 MF->push_back(DispatchBB);
37192 MF->push_back(DispContBB);
37193 MF->push_back(TrapBB);
37194
37195 // Insert code into the entry block that creates and registers the function
37196 // context.
37197 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37198
37199 // Create the jump table and associated information
37200 unsigned JTE = getJumpTableEncoding();
37201 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37202 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37203
37204 const X86RegisterInfo &RI = TII->getRegisterInfo();
37205 // Add a register mask with no preserved registers. This results in all
37206 // registers being marked as clobbered.
37207 if (RI.hasBasePointer(*MF)) {
37208 const bool FPIs64Bit =
37209 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37210 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37211 MFI->setRestoreBasePointer(MF);
37212
37213 Register FP = RI.getFrameRegister(*MF);
37214 Register BP = RI.getBaseRegister();
37215 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37216 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37217 MFI->getRestoreBasePointerOffset())
37218 .addRegMask(RI.getNoPreservedMask());
37219 } else {
37220 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37221 .addRegMask(RI.getNoPreservedMask());
37222 }
37223
37224 // IReg is used as an index in a memory operand and therefore can't be SP
37225 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37226 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37227 Subtarget.is64Bit() ? 8 : 4);
37228 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37229 .addReg(IReg)
37230 .addImm(LPadList.size());
37231 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37232
37233 if (Subtarget.is64Bit()) {
37234 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37235 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37236
37237 // leaq .LJTI0_0(%rip), BReg
37238 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37239 .addReg(X86::RIP)
37240 .addImm(1)
37241 .addReg(0)
37242 .addJumpTableIndex(MJTI)
37243 .addReg(0);
37244 // movzx IReg64, IReg
37245 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37246 .addImm(0)
37247 .addReg(IReg)
37248 .addImm(X86::sub_32bit);
37249
37250 switch (JTE) {
37251 case MachineJumpTableInfo::EK_BlockAddress:
37252 // jmpq *(BReg,IReg64,8)
37253 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37254 .addReg(BReg)
37255 .addImm(8)
37256 .addReg(IReg64)
37257 .addImm(0)
37258 .addReg(0);
37259 break;
37260 case MachineJumpTableInfo::EK_LabelDifference32: {
37261 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37262 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37263 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37264
37265 // movl (BReg,IReg64,4), OReg
37266 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37267 .addReg(BReg)
37268 .addImm(4)
37269 .addReg(IReg64)
37270 .addImm(0)
37271 .addReg(0);
37272 // movsx OReg64, OReg
37273 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37274 // addq BReg, OReg64, TReg
37275 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37276 .addReg(OReg64)
37277 .addReg(BReg);
37278 // jmpq *TReg
37279 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37280 break;
37281 }
37282 default:
37283     llvm_unreachable("Unexpected jump table encoding");
37284 }
37285 } else {
37286 // jmpl *.LJTI0_0(,IReg,4)
37287 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
37288 .addReg(0)
37289 .addImm(4)
37290 .addReg(IReg)
37291 .addJumpTableIndex(MJTI)
37292 .addReg(0);
37293 }
37294
37295 // Add the jump table entries as successors to the MBB.
37296 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37297 for (auto &LP : LPadList)
37298 if (SeenMBBs.insert(LP).second)
37299 DispContBB->addSuccessor(LP);
37300
37301 // N.B. the order the invoke BBs are processed in doesn't matter here.
37302 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37303 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37304 for (MachineBasicBlock *MBB : InvokeBBs) {
37305 // Remove the landing pad successor from the invoke block and replace it
37306 // with the new dispatch block.
37307 // Keep a copy of Successors since it's modified inside the loop.
37308 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37309 MBB->succ_rend());
37310 // FIXME: Avoid quadratic complexity.
37311 for (auto *MBBS : Successors) {
37312 if (MBBS->isEHPad()) {
37313 MBB->removeSuccessor(MBBS);
37314 MBBLPads.push_back(MBBS);
37315 }
37316 }
37317
37318 MBB->addSuccessor(DispatchBB);
37319
37320 // Find the invoke call and mark all of the callee-saved registers as
37321 // 'implicit defined' so that they're spilled. This prevents later passes from
37322 // moving instructions to before the EH block, where they will never be
37323 // executed.
37324 for (auto &II : reverse(*MBB)) {
37325 if (!II.isCall())
37326 continue;
37327
37328 DenseMap<unsigned, bool> DefRegs;
37329 for (auto &MOp : II.operands())
37330 if (MOp.isReg())
37331 DefRegs[MOp.getReg()] = true;
37332
37333 MachineInstrBuilder MIB(*MF, &II);
37334 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37335 unsigned Reg = SavedRegs[RegIdx];
37336 if (!DefRegs[Reg])
37337 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37338 }
37339
37340 break;
37341 }
37342 }
37343
37344 // Mark all former landing pads as non-landing pads. The dispatch is the only
37345 // landing pad now.
37346 for (auto &LP : MBBLPads)
37347 LP->setIsEHPad(false);
37348
37349 // The instruction is gone now.
37350 MI.eraseFromParent();
37351 return BB;
37352}
37353
37354MachineBasicBlock *
37355X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37356 MachineBasicBlock *BB) const {
37357 MachineFunction *MF = BB->getParent();
37358 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37359 const DebugLoc &DL = MI.getDebugLoc();
37360
37361 auto TMMImmToTMMReg = [](unsigned Imm) {
37362   assert (Imm < 8 && "Illegal tmm index");
37363 return X86::TMM0 + Imm;
37364 };
37365 switch (MI.getOpcode()) {
37366 default: llvm_unreachable("Unexpected instr type to insert");
37367 case X86::TLS_addr32:
37368 case X86::TLS_addr64:
37369 case X86::TLS_addrX32:
37370 case X86::TLS_base_addr32:
37371 case X86::TLS_base_addr64:
37372 case X86::TLS_base_addrX32:
37373 return EmitLoweredTLSAddr(MI, BB);
37374 case X86::INDIRECT_THUNK_CALL32:
37375 case X86::INDIRECT_THUNK_CALL64:
37376 case X86::INDIRECT_THUNK_TCRETURN32:
37377 case X86::INDIRECT_THUNK_TCRETURN64:
37378 return EmitLoweredIndirectThunk(MI, BB);
37379 case X86::CATCHRET:
37380 return EmitLoweredCatchRet(MI, BB);
37381 case X86::SEG_ALLOCA_32:
37382 case X86::SEG_ALLOCA_64:
37383 return EmitLoweredSegAlloca(MI, BB);
37384 case X86::PROBED_ALLOCA_32:
37385 case X86::PROBED_ALLOCA_64:
37386 return EmitLoweredProbedAlloca(MI, BB);
37387 case X86::TLSCall_32:
37388 case X86::TLSCall_64:
37389 return EmitLoweredTLSCall(MI, BB);
37390 case X86::CMOV_FR16:
37391 case X86::CMOV_FR16X:
37392 case X86::CMOV_FR32:
37393 case X86::CMOV_FR32X:
37394 case X86::CMOV_FR64:
37395 case X86::CMOV_FR64X:
37396 case X86::CMOV_GR8:
37397 case X86::CMOV_GR16:
37398 case X86::CMOV_GR32:
37399 case X86::CMOV_RFP32:
37400 case X86::CMOV_RFP64:
37401 case X86::CMOV_RFP80:
37402 case X86::CMOV_VR64:
37403 case X86::CMOV_VR128:
37404 case X86::CMOV_VR128X:
37405 case X86::CMOV_VR256:
37406 case X86::CMOV_VR256X:
37407 case X86::CMOV_VR512:
37408 case X86::CMOV_VK1:
37409 case X86::CMOV_VK2:
37410 case X86::CMOV_VK4:
37411 case X86::CMOV_VK8:
37412 case X86::CMOV_VK16:
37413 case X86::CMOV_VK32:
37414 case X86::CMOV_VK64:
37415 return EmitLoweredSelect(MI, BB);
37416
37417 case X86::FP80_ADDr:
37418 case X86::FP80_ADDm32: {
37419 // Change the floating point control register to use double extended
37420 // precision when performing the addition.
37421 int OrigCWFrameIdx =
37422 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37423 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
37424 OrigCWFrameIdx);
37425
37426 // Load the old value of the control word...
37427 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37428 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37429 OrigCWFrameIdx);
37430
37431 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37432 // precision.
37433 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37434 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37435 .addReg(OldCW, RegState::Kill)
37436 .addImm(0x300);
37437
37438 // Extract to 16 bits.
37439 Register NewCW16 =
37440 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37441 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37442 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37443
37444 // Prepare memory for FLDCW.
37445 int NewCWFrameIdx =
37446 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37447 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37448 NewCWFrameIdx)
37449 .addReg(NewCW16, RegState::Kill);
37450
37451 // Reload the modified control word now...
37452 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37453 NewCWFrameIdx);
37454
37455 // Do the addition.
37456 if (MI.getOpcode() == X86::FP80_ADDr) {
37457 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
37458 .add(MI.getOperand(0))
37459 .add(MI.getOperand(1))
37460 .add(MI.getOperand(2));
37461 } else {
37462 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
37463 .add(MI.getOperand(0))
37464 .add(MI.getOperand(1))
37465 .add(MI.getOperand(2))
37466 .add(MI.getOperand(3))
37467 .add(MI.getOperand(4))
37468 .add(MI.getOperand(5))
37469 .add(MI.getOperand(6));
37470 }
37471
37472 // Reload the original control word now.
37473 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37474 OrigCWFrameIdx);
37475
37476 MI.eraseFromParent(); // The pseudo instruction is gone now.
37477 return BB;
37478 }
37479
37480 case X86::FP32_TO_INT16_IN_MEM:
37481 case X86::FP32_TO_INT32_IN_MEM:
37482 case X86::FP32_TO_INT64_IN_MEM:
37483 case X86::FP64_TO_INT16_IN_MEM:
37484 case X86::FP64_TO_INT32_IN_MEM:
37485 case X86::FP64_TO_INT64_IN_MEM:
37486 case X86::FP80_TO_INT16_IN_MEM:
37487 case X86::FP80_TO_INT32_IN_MEM:
37488 case X86::FP80_TO_INT64_IN_MEM: {
37489 // Change the floating point control register to use "round towards zero"
37490 // mode when truncating to an integer value.
37491 int OrigCWFrameIdx =
37492 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37493 addFrameReference(BuildMI(*BB, MI, DL,
37494 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
37495
37496 // Load the old value of the control word...
37497 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37498 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37499 OrigCWFrameIdx);
37500
37501 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37502 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37503 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37504 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37505
37506 // Extract to 16 bits.
37507 Register NewCW16 =
37508 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37509 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37510 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37511
37512 // Prepare memory for FLDCW.
37513 int NewCWFrameIdx =
37514 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37515 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37516 NewCWFrameIdx)
37517 .addReg(NewCW16, RegState::Kill);
37518
37519 // Reload the modified control word now...
37520 addFrameReference(BuildMI(*BB, MI, DL,
37521 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37522
37523 // Get the X86 opcode to use.
37524 unsigned Opc;
37525 switch (MI.getOpcode()) {
37526     default: llvm_unreachable("illegal opcode!");
37527 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37528 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37529 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37530 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37531 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37532 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37533 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37534 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37535 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37536 }
37537
37538 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37539 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
37540 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37541
37542 // Reload the original control word now.
37543 addFrameReference(BuildMI(*BB, MI, DL,
37544 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
37545
37546 MI.eraseFromParent(); // The pseudo instruction is gone now.
37547 return BB;
37548 }
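
Both pseudo expansions above follow the same x87 pattern: FNSTCW spills the current control word to a stack slot, the reloaded value is OR-ed to force one field to 0b11, FLDCW installs the modified word, and the original word is restored once the operation is done. As a reference, here is a minimal standalone sketch of just the bit manipulation (illustrative only, not LLVM code; the field positions follow the x87 architecture, where bits 8-9 are the precision-control field and bits 10-11 are the rounding-control field):

    #include <cstdint>

    // x87 FPU control word fields (bit positions per the x87 architecture).
    constexpr uint16_t PrecisionControl = 0x300; // bits 8-9: 0b11 = 80-bit extended precision
    constexpr uint16_t RoundingControl  = 0xC00; // bits 10-11: 0b11 = round toward zero

    // Mirrors what the expanded pseudos do to the saved control word before FLDCW.
    uint16_t withExtendedPrecision(uint16_t SavedCW) { return SavedCW | PrecisionControl; }
    uint16_t withTruncation(uint16_t SavedCW) { return SavedCW | RoundingControl; }

OR-ing 0x300 gives FP80_ADD* the full 80-bit precision, while OR-ing 0xC00 gives the FP*_TO_INT*_IN_MEM pseudos the round-toward-zero (truncating) behavior that C-style float-to-int conversion requires.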
37549
37550 // xbegin
37551 case X86::XBEGIN:
37552 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37553
37554 case X86::VAARG_64:
37555 case X86::VAARG_X32:
37556 return EmitVAARGWithCustomInserter(MI, BB);
37557
37558 case X86::EH_SjLj_SetJmp32:
37559 case X86::EH_SjLj_SetJmp64:
37560 return emitEHSjLjSetJmp(MI, BB);
37561
37562 case X86::EH_SjLj_LongJmp32:
37563 case X86::EH_SjLj_LongJmp64:
37564 return emitEHSjLjLongJmp(MI, BB);
37565
37566 case X86::Int_eh_sjlj_setup_dispatch:
37567 return EmitSjLjDispatchBlock(MI, BB);
37568
37569 case TargetOpcode::STATEPOINT:
37570 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37571 // this point in the process. We diverge later.
37572 return emitPatchPoint(MI, BB);
37573
37574 case TargetOpcode::STACKMAP:
37575 case TargetOpcode::PATCHPOINT:
37576 return emitPatchPoint(MI, BB);
37577
37578 case TargetOpcode::PATCHABLE_EVENT_CALL:
37579 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37580 return BB;
37581
37582 case X86::LCMPXCHG8B: {
37583 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37584 // In addition to the four E[ABCD] registers implied by the encoding,
37585 // CMPXCHG8B requires a memory operand. If the current architecture is
37586 // i686 and the current function needs a base pointer
37587 // - which is ESI for i686 - the register allocator would not be able to
37588 // allocate registers for an address of the form X(%reg, %reg, Y):
37589 // there would never be enough unreserved registers during regalloc
37590 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
37591 // We give the register allocator a hand by precomputing the address in
37592 // a new vreg using LEA.
37593
37594 // If it is not i686 or there is no base pointer - nothing to do here.
37595 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37596 return BB;
37597
37598 // Even though this code does not necessarily need the base pointer to
37599 // be ESI, we check for that. The reason: if this assert fails, something
37600 // has changed in the compiler's base pointer handling, which most
37601 // probably has to be addressed somehow here.
37602     assert(TRI->getBaseRegister() == X86::ESI &&
37603            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
37604            "base pointer in mind");
37605
37606 MachineRegisterInfo &MRI = MF->getRegInfo();
37607 MVT SPTy = getPointerTy(MF->getDataLayout());
37608 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
37609 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
37610
37611 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37612 // Regalloc does not need any help when the memory operand of CMPXCHG8B
37613 // does not use index register.
37614 if (AM.IndexReg == X86::NoRegister)
37615 return BB;
37616
37617 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
37618 // four operand definitions that are E[ABCD] registers. We skip them and
37619 // then insert the LEA.
37620 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
37621 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
37622 RMBBI->definesRegister(X86::EBX) ||
37623 RMBBI->definesRegister(X86::ECX) ||
37624 RMBBI->definesRegister(X86::EDX))) {
37625 ++RMBBI;
37626 }
37627 MachineBasicBlock::iterator MBBI(RMBBI);
37628 addFullAddress(
37629 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
37630
37631 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
37632
37633 return BB;
37634 }
37635 case X86::LCMPXCHG16B_NO_RBX: {
37636 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37637 Register BasePtr = TRI->getBaseRegister();
37638 if (TRI->hasBasePointer(*MF) &&
37639 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
37640 if (!BB->isLiveIn(BasePtr))
37641 BB->addLiveIn(BasePtr);
37642 // Save RBX into a virtual register.
37643 Register SaveRBX =
37644 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37645 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
37646 .addReg(X86::RBX);
37647 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37648 MachineInstrBuilder MIB =
37649 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37650 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37651 MIB.add(MI.getOperand(Idx));
37652 MIB.add(MI.getOperand(X86::AddrNumOperands));
37653 MIB.addReg(SaveRBX);
37654 } else {
37655 // Simple case, just copy the virtual register to RBX.
37656 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
37657 .add(MI.getOperand(X86::AddrNumOperands));
37658 MachineInstrBuilder MIB =
37659 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
37660 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37661 MIB.add(MI.getOperand(Idx));
37662 }
37663 MI.eraseFromParent();
37664 return BB;
37665 }
37666 case X86::MWAITX: {
37667 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37668 Register BasePtr = TRI->getBaseRegister();
37669 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
37670 // If there is no need to save the base pointer, we generate MWAITXrrr;
37671 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
37672 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37673 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
37674 .addReg(MI.getOperand(0).getReg());
37675 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
37676 .addReg(MI.getOperand(1).getReg());
37677 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
37678 .addReg(MI.getOperand(2).getReg());
37679 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
37680 MI.eraseFromParent();
37681 } else {
37682 if (!BB->isLiveIn(BasePtr)) {
37683 BB->addLiveIn(BasePtr);
37684 }
37685 // Parameters can be copied into ECX and EAX but not EBX yet.
37686 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
37687 .addReg(MI.getOperand(0).getReg());
37688 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
37689 .addReg(MI.getOperand(1).getReg());
37690       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37691 // Save RBX into a virtual register.
37692 Register SaveRBX =
37693 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37694 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
37695 .addReg(X86::RBX);
37696 // Generate mwaitx pseudo.
37697 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37698 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
37699 .addDef(Dst) // Destination tied in with SaveRBX.
37700 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
37701 .addUse(SaveRBX); // Save of base pointer.
37702 MI.eraseFromParent();
37703 }
37704 return BB;
37705 }
37706 case TargetOpcode::PREALLOCATED_SETUP: {
37707     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37708 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
37709 MFI->setHasPreallocatedCall(true);
37710 int64_t PreallocatedId = MI.getOperand(0).getImm();
37711 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37712     assert(StackAdjustment != 0 && "0 stack adjustment");
37713     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
37714                       << StackAdjustment << "\n");
37715 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
37716 .addReg(X86::ESP)
37717 .addImm(StackAdjustment);
37718 MI.eraseFromParent();
37719 return BB;
37720 }
37721 case TargetOpcode::PREALLOCATED_ARG: {
37722     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37723 int64_t PreallocatedId = MI.getOperand(1).getImm();
37724 int64_t ArgIdx = MI.getOperand(2).getImm();
37725 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
37726 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37727     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
37728                       << ", arg offset " << ArgOffset << "\n");
37729 // stack pointer + offset
37730 addRegOffset(
37731 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
37732 X86::ESP, false, ArgOffset);
37733 MI.eraseFromParent();
37734 return BB;
37735 }
37736 case X86::PTDPBSSD:
37737 case X86::PTDPBSUD:
37738 case X86::PTDPBUSD:
37739 case X86::PTDPBUUD:
37740 case X86::PTDPBF16PS:
37741 case X86::PTDPFP16PS: {
37742 unsigned Opc;
37743 switch (MI.getOpcode()) {
37744     default: llvm_unreachable("illegal opcode!");
37745 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
37746 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
37747 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
37748 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
37749 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
37750 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
37751 }
37752
37753 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37754 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37755 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
37756 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37757 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
37758
37759 MI.eraseFromParent(); // The pseudo is gone now.
37760 return BB;
37761 }
37762 case X86::PTILEZERO: {
37763 unsigned Imm = MI.getOperand(0).getImm();
37764 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37765 MI.eraseFromParent(); // The pseudo is gone now.
37766 return BB;
37767 }
37768 case X86::PTILELOADD:
37769 case X86::PTILELOADDT1:
37770 case X86::PTILESTORED: {
37771 unsigned Opc;
37772 switch (MI.getOpcode()) {
37773     default: llvm_unreachable("illegal opcode!");
37774 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
37775 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
37776 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
37777 }
37778
37779 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37780 unsigned CurOp = 0;
37781 if (Opc != X86::TILESTORED)
37782 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37783 RegState::Define);
37784
37785 MIB.add(MI.getOperand(CurOp++)); // base
37786 MIB.add(MI.getOperand(CurOp++)); // scale
37787 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37788 MIB.add(MI.getOperand(CurOp++)); // displacement
37789 MIB.add(MI.getOperand(CurOp++)); // segment
37790
37791 if (Opc == X86::TILESTORED)
37792 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37793 RegState::Undef);
37794
37795 MI.eraseFromParent(); // The pseudo is gone now.
37796 return BB;
37797 }
37798 }
37799}
37800
37801//===----------------------------------------------------------------------===//
37802// X86 Optimization Hooks
37803//===----------------------------------------------------------------------===//
37804
37805bool
37806X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
37807 const APInt &DemandedBits,
37808 const APInt &DemandedElts,
37809 TargetLoweringOpt &TLO) const {
37810 EVT VT = Op.getValueType();
37811 unsigned Opcode = Op.getOpcode();
37812 unsigned EltSize = VT.getScalarSizeInBits();
37813
37814 if (VT.isVector()) {
37815 // If the constant is only all signbits in the active bits, then we should
37816 // extend it to the entire constant to allow it to act as a boolean constant
37817 // vector.
37818 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
37819 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
37820 return false;
37821 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
37822 if (!DemandedElts[i] || V.getOperand(i).isUndef())
37823 continue;
37824 const APInt &Val = V.getConstantOperandAPInt(i);
37825 if (Val.getBitWidth() > Val.getNumSignBits() &&
37826 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
37827 return true;
37828 }
37829 return false;
37830 };
37831 // For vectors - if we have a constant, then try to sign extend.
37832 // TODO: Handle AND/ANDN cases.
37833 unsigned ActiveBits = DemandedBits.getActiveBits();
37834 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
37835 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
37836 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
37837 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
37838 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
37839 VT.getVectorNumElements());
37840 SDValue NewC =
37841 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
37842 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
37843 SDValue NewOp =
37844 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
37845 return TLO.CombineTo(Op, NewOp);
37846 }
37847 return false;
37848 }
37849
37850 // Only optimize Ands to prevent shrinking a constant that could be
37851 // matched by movzx.
37852 if (Opcode != ISD::AND)
37853 return false;
37854
37855 // Make sure the RHS really is a constant.
37856 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
37857 if (!C)
37858 return false;
37859
37860 const APInt &Mask = C->getAPIntValue();
37861
37862 // Clear all non-demanded bits initially.
37863 APInt ShrunkMask = Mask & DemandedBits;
37864
37865 // Find the width of the shrunk mask.
37866 unsigned Width = ShrunkMask.getActiveBits();
37867
37868 // If the mask is all 0s there's nothing to do here.
37869 if (Width == 0)
37870 return false;
37871
37872 // Find the next power of 2 width, rounding up to a byte.
37873 Width = llvm::bit_ceil(std::max(Width, 8U));
37874 // Truncate the width to size to handle illegal types.
37875 Width = std::min(Width, EltSize);
37876
37877 // Calculate a possible zero extend mask for this constant.
37878 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
37879
37880 // If we aren't changing the mask, just return true to keep it and prevent
37881 // the caller from optimizing.
37882 if (ZeroExtendMask == Mask)
37883 return true;
37884
37885 // Make sure the new mask can be represented by a combination of mask bits
37886 // and non-demanded bits.
37887 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
37888 return false;
37889
37890 // Replace the constant with the zero extend mask.
37891 SDLoc DL(Op);
37892 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
37893 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
37894 return TLO.CombineTo(Op, NewOp);
37895}
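
To make the mask-widening logic above concrete, here is a simplified scalar model (an illustrative sketch, not the APInt-based routine itself; it only answers whether the AND constant could be replaced by a low-bits zero-extend mask):

    #include <algorithm>
    #include <bit>
    #include <cstdint>

    // Returns true and sets NewMask if 'Mask' can be widened to a low-bits mask
    // that zero-extension idioms (movzx) can match, given the demanded bits.
    bool shrinkAndMask(uint64_t Mask, uint64_t DemandedBits, unsigned EltSize,
                       uint64_t &NewMask) {
      uint64_t Shrunk = Mask & DemandedBits;          // clear non-demanded bits
      if (Shrunk == 0)
        return false;                                 // all-zero mask, nothing to do
      unsigned Width = 64 - std::countl_zero(Shrunk); // active bits of the shrunk mask
      Width = std::bit_ceil(std::max(Width, 8u));     // next power of 2, at least a byte
      Width = std::min(Width, EltSize);               // clamp to the element size
      uint64_t ZeroExtendMask = Width >= 64 ? ~0ULL : ((1ULL << Width) - 1);
      if (ZeroExtendMask == Mask)
        return false;                                 // already in the desired form
      // Every bit we would newly set must already be in the mask or be non-demanded.
      if ((ZeroExtendMask & ~(Mask | ~DemandedBits)) != 0)
        return false;
      NewMask = ZeroExtendMask;
      return true;
    }

For example, Mask = 0x1FF with only the low 8 bits demanded shrinks to an active width of 8, so the constant can be replaced by 0xFF, which the AND-with-movzx pattern can match.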
37896
37897void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
37898 KnownBits &Known,
37899 const APInt &DemandedElts,
37900 const SelectionDAG &DAG,
37901 unsigned Depth) const {
37902 unsigned BitWidth = Known.getBitWidth();
37903 unsigned NumElts = DemandedElts.getBitWidth();
37904 unsigned Opc = Op.getOpcode();
37905 EVT VT = Op.getValueType();
37906   assert((Opc >= ISD::BUILTIN_OP_END ||
37907           Opc == ISD::INTRINSIC_WO_CHAIN ||
37908           Opc == ISD::INTRINSIC_W_CHAIN ||
37909           Opc == ISD::INTRINSIC_VOID) &&
37910          "Should use MaskedValueIsZero if you don't know whether Op"
37911          " is a target node!");
37912
37913 Known.resetAll();
37914 switch (Opc) {
37915 default: break;
37916 case X86ISD::SETCC:
37917 Known.Zero.setBitsFrom(1);
37918 break;
37919 case X86ISD::MOVMSK: {
37920 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
37921 Known.Zero.setBitsFrom(NumLoBits);
37922 break;
37923 }
37924 case X86ISD::PEXTRB:
37925 case X86ISD::PEXTRW: {
37926 SDValue Src = Op.getOperand(0);
37927 EVT SrcVT = Src.getValueType();
37928 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
37929 Op.getConstantOperandVal(1));
37930 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
37931 Known = Known.anyextOrTrunc(BitWidth);
37932 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
37933 break;
37934 }
37935 case X86ISD::VSRAI:
37936 case X86ISD::VSHLI:
37937 case X86ISD::VSRLI: {
37938 unsigned ShAmt = Op.getConstantOperandVal(1);
37939 if (ShAmt >= VT.getScalarSizeInBits()) {
37940 // Out of range logical bit shifts are guaranteed to be zero.
37941 // Out of range arithmetic bit shifts splat the sign bit.
37942 if (Opc != X86ISD::VSRAI) {
37943 Known.setAllZero();
37944 break;
37945 }
37946
37947 ShAmt = VT.getScalarSizeInBits() - 1;
37948 }
37949
37950 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37951 if (Opc == X86ISD::VSHLI) {
37952 Known.Zero <<= ShAmt;
37953 Known.One <<= ShAmt;
37954 // Low bits are known zero.
37955 Known.Zero.setLowBits(ShAmt);
37956 } else if (Opc == X86ISD::VSRLI) {
37957 Known.Zero.lshrInPlace(ShAmt);
37958 Known.One.lshrInPlace(ShAmt);
37959 // High bits are known zero.
37960 Known.Zero.setHighBits(ShAmt);
37961 } else {
37962 Known.Zero.ashrInPlace(ShAmt);
37963 Known.One.ashrInPlace(ShAmt);
37964 }
37965 break;
37966 }
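
For reference, the shift handling just above reduces to a simple rule: a left shift makes the vacated low bits known zero, a logical right shift makes the vacated high bits known zero, and an arithmetic right shift replicates whatever is known about the sign bit. A minimal scalar sketch of the two logical cases (illustrative only; known-zero/known-one are plain bit masks and 0 < Amt < Bits <= 63 is assumed):

    #include <cassert>
    #include <cstdint>

    struct KnownPair { uint64_t Zero = 0, One = 0; }; // bit set => value bit known 0 / known 1

    KnownPair knownShl(KnownPair K, unsigned Amt, unsigned Bits) {
      assert(0 < Amt && Amt < Bits && Bits <= 63);
      uint64_t Mask = (1ULL << Bits) - 1;
      K.Zero = ((K.Zero << Amt) | ((1ULL << Amt) - 1)) & Mask; // vacated low bits are known zero
      K.One = (K.One << Amt) & Mask;
      return K;
    }

    KnownPair knownLshr(KnownPair K, unsigned Amt, unsigned Bits) {
      assert(0 < Amt && Amt < Bits && Bits <= 63);
      uint64_t HighZeros = ((1ULL << Amt) - 1) << (Bits - Amt); // vacated high bits are known zero
      K.Zero = (K.Zero >> Amt) | HighZeros;
      K.One >>= Amt;
      return K;
    }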
37967 case X86ISD::PACKUS: {
37968 // PACKUS is just a truncation if the upper half is zero.
37969 APInt DemandedLHS, DemandedRHS;
37970 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37971
37972 Known.One = APInt::getAllOnes(BitWidth * 2);
37973 Known.Zero = APInt::getAllOnes(BitWidth * 2);
37974
37975 KnownBits Known2;
37976 if (!!DemandedLHS) {
37977 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
37978 Known = KnownBits::commonBits(Known, Known2);
37979 }
37980 if (!!DemandedRHS) {
37981 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
37982 Known = KnownBits::commonBits(Known, Known2);
37983 }
37984
37985 if (Known.countMinLeadingZeros() < BitWidth)
37986 Known.resetAll();
37987 Known = Known.trunc(BitWidth);
37988 break;
37989 }
37990 case X86ISD::VBROADCAST: {
37991 SDValue Src = Op.getOperand(0);
37992 if (!Src.getSimpleValueType().isVector()) {
37993 Known = DAG.computeKnownBits(Src, Depth + 1);
37994 return;
37995 }
37996 break;
37997 }
37998 case X86ISD::AND: {
37999 if (Op.getResNo() == 0) {
38000 KnownBits Known2;
38001 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38002 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38003 Known &= Known2;
38004 }
38005 break;
38006 }
38007 case X86ISD::ANDNP: {
38008 KnownBits Known2;
38009 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38010 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38011
38012 // ANDNP = (~X & Y);
38013 Known.One &= Known2.Zero;
38014 Known.Zero |= Known2.One;
38015 break;
38016 }
38017 case X86ISD::FOR: {
38018 KnownBits Known2;
38019 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38020 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38021
38022 Known |= Known2;
38023 break;
38024 }
38025 case X86ISD::PSADBW: {
38026     assert(VT.getScalarType() == MVT::i64 &&
38027            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38028            "Unexpected PSADBW types");
38029
38030 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
38031 Known.Zero.setBitsFrom(16);
38032 break;
38033 }
38034 case X86ISD::PMULUDQ: {
38035 KnownBits Known2;
38036 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38037 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38038
38039 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38040 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38041 Known = KnownBits::mul(Known, Known2);
38042 break;
38043 }
38044 case X86ISD::CMOV: {
38045 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38046 // If we don't know any bits, early out.
38047 if (Known.isUnknown())
38048 break;
38049 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38050
38051 // Only known if known in both the LHS and RHS.
38052 Known = KnownBits::commonBits(Known, Known2);
38053 break;
38054 }
38055 case X86ISD::BEXTR:
38056 case X86ISD::BEXTRI: {
38057 SDValue Op0 = Op.getOperand(0);
38058 SDValue Op1 = Op.getOperand(1);
38059
38060 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38061 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38062 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38063
38064 // If the length is 0, the result is 0.
38065 if (Length == 0) {
38066 Known.setAllZero();
38067 break;
38068 }
38069
38070 if ((Shift + Length) <= BitWidth) {
38071 Known = DAG.computeKnownBits(Op0, Depth + 1);
38072 Known = Known.extractBits(Length, Shift);
38073 Known = Known.zextOrTrunc(BitWidth);
38074 }
38075 }
38076 break;
38077 }
38078 case X86ISD::PDEP: {
38079 KnownBits Known2;
38080 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38081 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38082 // Zeros are retained from the mask operand. But not ones.
38083 Known.One.clearAllBits();
38084 // The result will have at least as many trailing zeros as the non-mask
38085 // operand since bits can only map to the same or higher bit position.
38086 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38087 break;
38088 }
38089 case X86ISD::PEXT: {
38090 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38091 // The result has as many leading zeros as the number of zeroes in the mask.
38092 unsigned Count = Known.Zero.popcount();
38093 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38094 Known.One.clearAllBits();
38095 break;
38096 }
38097 case X86ISD::VTRUNC:
38098 case X86ISD::VTRUNCS:
38099 case X86ISD::VTRUNCUS:
38100 case X86ISD::CVTSI2P:
38101 case X86ISD::CVTUI2P:
38102 case X86ISD::CVTP2SI:
38103 case X86ISD::CVTP2UI:
38104 case X86ISD::MCVTP2SI:
38105 case X86ISD::MCVTP2UI:
38106 case X86ISD::CVTTP2SI:
38107 case X86ISD::CVTTP2UI:
38108 case X86ISD::MCVTTP2SI:
38109 case X86ISD::MCVTTP2UI:
38110 case X86ISD::MCVTSI2P:
38111 case X86ISD::MCVTUI2P:
38112 case X86ISD::VFPROUND:
38113 case X86ISD::VMFPROUND:
38114 case X86ISD::CVTPS2PH:
38115 case X86ISD::MCVTPS2PH: {
38116 // Truncations/Conversions - upper elements are known zero.
38117 EVT SrcVT = Op.getOperand(0).getValueType();
38118 if (SrcVT.isVector()) {
38119 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38120 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38121 Known.setAllZero();
38122 }
38123 break;
38124 }
38125 case X86ISD::STRICT_CVTTP2SI:
38126 case X86ISD::STRICT_CVTTP2UI:
38127 case X86ISD::STRICT_CVTSI2P:
38128 case X86ISD::STRICT_CVTUI2P:
38129 case X86ISD::STRICT_VFPROUND:
38130 case X86ISD::STRICT_CVTPS2PH: {
38131 // Strict Conversions - upper elements are known zero.
38132 EVT SrcVT = Op.getOperand(1).getValueType();
38133 if (SrcVT.isVector()) {
38134 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38135 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38136 Known.setAllZero();
38137 }
38138 break;
38139 }
38140 case X86ISD::MOVQ2DQ: {
38141 // Move from MMX to XMM. Upper half of XMM should be 0.
38142 if (DemandedElts.countr_zero() >= (NumElts / 2))
38143 Known.setAllZero();
38144 break;
38145 }
38146 case X86ISD::VBROADCAST_LOAD: {
38147 APInt UndefElts;
38148 SmallVector<APInt, 16> EltBits;
38149 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38150 /*AllowWholeUndefs*/ false,
38151 /*AllowPartialUndefs*/ false)) {
38152 Known.Zero.setAllBits();
38153 Known.One.setAllBits();
38154 for (unsigned I = 0; I != NumElts; ++I) {
38155 if (!DemandedElts[I])
38156 continue;
38157 if (UndefElts[I]) {
38158 Known.resetAll();
38159 break;
38160 }
38161 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38162 Known = KnownBits::commonBits(Known, Known2);
38163 }
38164 return;
38165 }
38166 break;
38167 }
38168 }
38169
38170 // Handle target shuffles.
38171 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38172 if (isTargetShuffle(Opc)) {
38173 SmallVector<int, 64> Mask;
38174 SmallVector<SDValue, 2> Ops;
38175 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38176 unsigned NumOps = Ops.size();
38177 unsigned NumElts = VT.getVectorNumElements();
38178 if (Mask.size() == NumElts) {
38179 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38180 Known.Zero.setAllBits(); Known.One.setAllBits();
38181 for (unsigned i = 0; i != NumElts; ++i) {
38182 if (!DemandedElts[i])
38183 continue;
38184 int M = Mask[i];
38185 if (M == SM_SentinelUndef) {
38186 // For UNDEF elements, we don't know anything about the common state
38187 // of the shuffle result.
38188 Known.resetAll();
38189 break;
38190 }
38191 if (M == SM_SentinelZero) {
38192 Known.One.clearAllBits();
38193 continue;
38194 }
38195         assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38196                "Shuffle index out of range");
38197
38198 unsigned OpIdx = (unsigned)M / NumElts;
38199 unsigned EltIdx = (unsigned)M % NumElts;
38200 if (Ops[OpIdx].getValueType() != VT) {
38201 // TODO - handle target shuffle ops with different value types.
38202 Known.resetAll();
38203 break;
38204 }
38205 DemandedOps[OpIdx].setBit(EltIdx);
38206 }
38207 // Known bits are the values that are shared by every demanded element.
38208 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38209 if (!DemandedOps[i])
38210 continue;
38211 KnownBits Known2 =
38212 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38213 Known = KnownBits::commonBits(Known, Known2);
38214 }
38215 }
38216 }
38217 }
38218}
38219
38220unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38221 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38222 unsigned Depth) const {
38223 EVT VT = Op.getValueType();
38224 unsigned VTBits = VT.getScalarSizeInBits();
38225 unsigned Opcode = Op.getOpcode();
38226 switch (Opcode) {
38227 case X86ISD::SETCC_CARRY:
38228 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38229 return VTBits;
38230
38231 case X86ISD::VTRUNC: {
38232 SDValue Src = Op.getOperand(0);
38233 MVT SrcVT = Src.getSimpleValueType();
38234 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38235     assert(VTBits < NumSrcBits && "Illegal truncation input type");
38236 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38237 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38238 if (Tmp > (NumSrcBits - VTBits))
38239 return Tmp - (NumSrcBits - VTBits);
38240 return 1;
38241 }
38242
38243 case X86ISD::PACKSS: {
38244 // PACKSS is just a truncation if the sign bits extend to the packed size.
38245 APInt DemandedLHS, DemandedRHS;
38246 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38247 DemandedRHS);
38248
38249 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38250 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38251 if (!!DemandedLHS)
38252 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38253 if (!!DemandedRHS)
38254 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38255 unsigned Tmp = std::min(Tmp0, Tmp1);
38256 if (Tmp > (SrcBits - VTBits))
38257 return Tmp - (SrcBits - VTBits);
38258 return 1;
38259 }
38260
38261 case X86ISD::VBROADCAST: {
38262 SDValue Src = Op.getOperand(0);
38263 if (!Src.getSimpleValueType().isVector())
38264 return DAG.ComputeNumSignBits(Src, Depth + 1);
38265 break;
38266 }
38267
38268 case X86ISD::VSHLI: {
38269 SDValue Src = Op.getOperand(0);
38270 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38271 if (ShiftVal.uge(VTBits))
38272 return VTBits; // Shifted all bits out --> zero.
38273 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38274 if (ShiftVal.uge(Tmp))
38275 return 1; // Shifted all sign bits out --> unknown.
38276 return Tmp - ShiftVal.getZExtValue();
38277 }
38278
38279 case X86ISD::VSRAI: {
38280 SDValue Src = Op.getOperand(0);
38281 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38282 if (ShiftVal.uge(VTBits - 1))
38283 return VTBits; // Sign splat.
38284 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38285 ShiftVal += Tmp;
38286 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38287 }
38288
38289 case X86ISD::FSETCC:
38290 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38291 if (VT == MVT::f32 || VT == MVT::f64 ||
38292 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38293 return VTBits;
38294 break;
38295
38296 case X86ISD::PCMPGT:
38297 case X86ISD::PCMPEQ:
38298 case X86ISD::CMPP:
38299 case X86ISD::VPCOM:
38300 case X86ISD::VPCOMU:
38301 // Vector compares return zero/all-bits result values.
38302 return VTBits;
38303
38304 case X86ISD::ANDNP: {
38305 unsigned Tmp0 =
38306 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38307 if (Tmp0 == 1) return 1; // Early out.
38308 unsigned Tmp1 =
38309 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38310 return std::min(Tmp0, Tmp1);
38311 }
38312
38313 case X86ISD::CMOV: {
38314 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38315 if (Tmp0 == 1) return 1; // Early out.
38316 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38317 return std::min(Tmp0, Tmp1);
38318 }
38319 }
38320
38321 // Handle target shuffles.
38322 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38323 if (isTargetShuffle(Opcode)) {
38324 SmallVector<int, 64> Mask;
38325 SmallVector<SDValue, 2> Ops;
38326 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38327 unsigned NumOps = Ops.size();
38328 unsigned NumElts = VT.getVectorNumElements();
38329 if (Mask.size() == NumElts) {
38330 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38331 for (unsigned i = 0; i != NumElts; ++i) {
38332 if (!DemandedElts[i])
38333 continue;
38334 int M = Mask[i];
38335 if (M == SM_SentinelUndef) {
38336 // For UNDEF elements, we don't know anything about the common state
38337 // of the shuffle result.
38338 return 1;
38339 } else if (M == SM_SentinelZero) {
38340 // Zero = all sign bits.
38341 continue;
38342 }
38343         assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38344                "Shuffle index out of range");
38345
38346 unsigned OpIdx = (unsigned)M / NumElts;
38347 unsigned EltIdx = (unsigned)M % NumElts;
38348 if (Ops[OpIdx].getValueType() != VT) {
38349 // TODO - handle target shuffle ops with different value types.
38350 return 1;
38351 }
38352 DemandedOps[OpIdx].setBit(EltIdx);
38353 }
38354 unsigned Tmp0 = VTBits;
38355 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38356 if (!DemandedOps[i])
38357 continue;
38358 unsigned Tmp1 =
38359 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38360 Tmp0 = std::min(Tmp0, Tmp1);
38361 }
38362 return Tmp0;
38363 }
38364 }
38365 }
38366
38367 // Fallback case.
38368 return 1;
38369}
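
The truncation-style cases above (VTRUNC, PACKSS) all apply the same counting rule: if the source value has Tmp copies of the sign bit and the top (SrcBits - DstBits) bits are dropped, the result keeps Tmp - (SrcBits - DstBits) of them, but never fewer than one. A standalone sketch of that rule (illustrative, assumes DstBits < SrcBits):

    // Sign bits surviving a truncation from SrcBits to DstBits.
    unsigned signBitsAfterTrunc(unsigned Tmp, unsigned SrcBits, unsigned DstBits) {
      unsigned Dropped = SrcBits - DstBits;
      return Tmp > Dropped ? Tmp - Dropped : 1;
    }

For example, a v8i32 source with 20 known sign bits packed down to v8i16 still has 20 - 16 = 4 sign bits per element.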
38370
38371SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
38372 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38373 return N->getOperand(0);
38374 return N;
38375}
38376
38377// Helper to look for a normal load that can be narrowed into a vzload with the
38378// specified VT and memory VT. Returns SDValue() on failure.
38379static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38380 SelectionDAG &DAG) {
38381 // Can't if the load is volatile or atomic.
38382 if (!LN->isSimple())
38383 return SDValue();
38384
38385 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38386 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38387 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38388 LN->getPointerInfo(), LN->getOriginalAlign(),
38389 LN->getMemOperand()->getFlags());
38390}
38391
38392// Attempt to match a combined shuffle mask against supported unary shuffle
38393// instructions.
38394// TODO: Investigate sharing more of this with shuffle lowering.
38395static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38396 bool AllowFloatDomain, bool AllowIntDomain,
38397 SDValue V1, const SelectionDAG &DAG,
38398 const X86Subtarget &Subtarget, unsigned &Shuffle,
38399 MVT &SrcVT, MVT &DstVT) {
38400 unsigned NumMaskElts = Mask.size();
38401 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38402
38403 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38404 if (Mask[0] == 0 &&
38405 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38406 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38407 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38408 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38409 Shuffle = X86ISD::VZEXT_MOVL;
38410 if (MaskEltSize == 16)
38411 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38412 else
38413 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38414 return true;
38415 }
38416 }
38417
38418 // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
38419 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38420 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38421 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38422 unsigned MaxScale = 64 / MaskEltSize;
38423 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38424 bool MatchAny = true;
38425 bool MatchZero = true;
38426 unsigned NumDstElts = NumMaskElts / Scale;
38427 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
38428 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38429 MatchAny = MatchZero = false;
38430 break;
38431 }
38432 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
38433 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
38434 }
38435 if (MatchAny || MatchZero) {
38436       assert(MatchZero && "Failed to match zext but matched aext?");
38437 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38438 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
38439 MVT::getIntegerVT(MaskEltSize);
38440 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38441
38442 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
38443 if (SrcVT.getVectorNumElements() != NumDstElts)
38444 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38445
38446 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38447 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38448 return true;
38449 }
38450 }
38451 }
38452
38453 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38454 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38455 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38456 isUndefOrEqual(Mask[0], 0) &&
38457 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38458 Shuffle = X86ISD::VZEXT_MOVL;
38459 if (MaskEltSize == 16)
38460 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38461 else
38462 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38463 return true;
38464 }
38465
38466   // Check if we have SSE3, which will let us use MOVDDUP etc. These
38467   // instructions are no slower than UNPCKLPD but have the option to
38468   // fold the input operand into even an unaligned memory load.
38469 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38470 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38471 Shuffle = X86ISD::MOVDDUP;
38472 SrcVT = DstVT = MVT::v2f64;
38473 return true;
38474 }
38475 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38476 Shuffle = X86ISD::MOVSLDUP;
38477 SrcVT = DstVT = MVT::v4f32;
38478 return true;
38479 }
38480 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38481 Shuffle = X86ISD::MOVSHDUP;
38482 SrcVT = DstVT = MVT::v4f32;
38483 return true;
38484 }
38485 }
38486
38487 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38488     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38489 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38490 Shuffle = X86ISD::MOVDDUP;
38491 SrcVT = DstVT = MVT::v4f64;
38492 return true;
38493 }
38494 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38495 V1)) {
38496 Shuffle = X86ISD::MOVSLDUP;
38497 SrcVT = DstVT = MVT::v8f32;
38498 return true;
38499 }
38500 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
38501 V1)) {
38502 Shuffle = X86ISD::MOVSHDUP;
38503 SrcVT = DstVT = MVT::v8f32;
38504 return true;
38505 }
38506 }
38507
38508 if (MaskVT.is512BitVector() && AllowFloatDomain) {
38509     assert(Subtarget.hasAVX512() &&
38510            "AVX512 required for 512-bit vector shuffles");
38511 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38512 V1)) {
38513 Shuffle = X86ISD::MOVDDUP;
38514 SrcVT = DstVT = MVT::v8f64;
38515 return true;
38516 }
38517 if (isTargetShuffleEquivalent(
38518 MaskVT, Mask,
38519 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
38520 Shuffle = X86ISD::MOVSLDUP;
38521 SrcVT = DstVT = MVT::v16f32;
38522 return true;
38523 }
38524 if (isTargetShuffleEquivalent(
38525 MaskVT, Mask,
38526 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
38527 Shuffle = X86ISD::MOVSHDUP;
38528 SrcVT = DstVT = MVT::v16f32;
38529 return true;
38530 }
38531 }
38532
38533 return false;
38534}
38535
38536// Attempt to match a combined shuffle mask against supported unary immediate
38537// permute instructions.
38538// TODO: Investigate sharing more of this with shuffle lowering.
38539static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
38540 const APInt &Zeroable,
38541 bool AllowFloatDomain, bool AllowIntDomain,
38542 const SelectionDAG &DAG,
38543 const X86Subtarget &Subtarget,
38544 unsigned &Shuffle, MVT &ShuffleVT,
38545 unsigned &PermuteImm) {
38546 unsigned NumMaskElts = Mask.size();
38547 unsigned InputSizeInBits = MaskVT.getSizeInBits();
38548 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
38549 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
38550 bool ContainsZeros = isAnyZero(Mask);
38551
38552   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
38553 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
38554 // Check for lane crossing permutes.
38555 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
38556 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
38557 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
38558 Shuffle = X86ISD::VPERMI;
38559 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
38560 PermuteImm = getV4X86ShuffleImm(Mask);
38561 return true;
38562 }
38563 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
38564 SmallVector<int, 4> RepeatedMask;
38565 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
38566 Shuffle = X86ISD::VPERMI;
38567 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
38568 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
38569 return true;
38570 }
38571 }
38572 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
38573 // VPERMILPD can permute with a non-repeating shuffle.
38574 Shuffle = X86ISD::VPERMILPI;
38575 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
38576 PermuteImm = 0;
38577 for (int i = 0, e = Mask.size(); i != e; ++i) {
38578 int M = Mask[i];
38579 if (M == SM_SentinelUndef)
38580 continue;
38581       assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
38582 PermuteImm |= (M & 1) << i;
38583 }
38584 return true;
38585 }
38586 }
38587
38588 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
38589   // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
38590 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
38591 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
38592 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
38593 SmallVector<int, 4> RepeatedMask;
38594 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38595 // Narrow the repeated mask to create 32-bit element permutes.
38596 SmallVector<int, 4> WordMask = RepeatedMask;
38597 if (MaskScalarSizeInBits == 64)
38598 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
38599
38600 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
38601 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
38602 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
38603 PermuteImm = getV4X86ShuffleImm(WordMask);
38604 return true;
38605 }
38606 }
38607
38608 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
38609 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
38610 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38611 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38612 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38613 SmallVector<int, 4> RepeatedMask;
38614 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38615 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
38616 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
38617
38618 // PSHUFLW: permute lower 4 elements only.
38619 if (isUndefOrInRange(LoMask, 0, 4) &&
38620 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
38621 Shuffle = X86ISD::PSHUFLW;
38622 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38623 PermuteImm = getV4X86ShuffleImm(LoMask);
38624 return true;
38625 }
38626
38627 // PSHUFHW: permute upper 4 elements only.
38628 if (isUndefOrInRange(HiMask, 4, 8) &&
38629 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
38630 // Offset the HiMask so that we can create the shuffle immediate.
38631 int OffsetHiMask[4];
38632 for (int i = 0; i != 4; ++i)
38633 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
38634
38635 Shuffle = X86ISD::PSHUFHW;
38636 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38637 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
38638 return true;
38639 }
38640 }
38641 }
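// --- Annotation (not in the original source): a simplified sketch of the
// 2-bits-per-lane immediate packing that getV4X86ShuffleImm performs for the
// PSHUFLW/PSHUFHW (and PSHUFD) paths above. The real helper has more involved
// undef handling; here undef lanes simply keep their identity index.
#include <array>

static unsigned packV4ShuffleImm(const std::array<int, 4> &M) {
  unsigned Imm = 0;
  for (unsigned I = 0; I != 4; ++I)
    Imm |= unsigned(M[I] < 0 ? int(I) : M[I]) << (I * 2);
  return Imm; // e.g. packV4ShuffleImm({3, 2, 1, 0}) == 0x1B (reverse the 4 lanes)
}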
38642
38643 // Attempt to match against byte/bit shifts.
38644 if (AllowIntDomain &&
38645 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38646 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38647 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38648 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
38649 Mask, 0, Zeroable, Subtarget);
38650 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
38651 32 <= ShuffleVT.getScalarSizeInBits())) {
38652 PermuteImm = (unsigned)ShiftAmt;
38653 return true;
38654 }
38655 }
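// --- Annotation (not in the original source): a hypothetical, simplified
// illustration of the element-shift matching idea used by matchShuffleAsShift
// above. Using LLVM's sentinel convention (-1 undef, -2 zero), a mask is a
// logical right shift by N elements when every kept lane reads index i+N and
// every vacated top lane is undef or required to be zero.
#include <vector>

static int matchMaskAsRightShift(const std::vector<int> &Mask) {
  const int NumElts = int(Mask.size());
  for (int Shift = 1; Shift < NumElts; ++Shift) {
    bool Matches = true;
    for (int I = 0; I != NumElts && Matches; ++I) {
      const int Src = I + Shift;
      if (Src < NumElts)
        Matches = (Mask[I] == -1 || Mask[I] == Src); // undef or shifted source
      else
        Matches = (Mask[I] == -1 || Mask[I] == -2);  // shifted-in zero (or undef)
    }
    if (Matches)
      return Shift;
  }
  return -1; // not a right-shift pattern; the real helper also handles left shifts
}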
38656
38657 // Attempt to match against bit rotates.
38658 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
38659 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
38660 Subtarget.hasAVX512())) {
38661 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
38662 Subtarget, Mask);
38663 if (0 < RotateAmt) {
38664 Shuffle = X86ISD::VROTLI;
38665 PermuteImm = (unsigned)RotateAmt;
38666 return true;
38667 }
38668 }
38669
38670 return false;
38671}
38672
38673// Attempt to match a combined unary shuffle mask against supported binary
38674// shuffle instructions.
38675// TODO: Investigate sharing more of this with shuffle lowering.
38676static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38677 bool AllowFloatDomain, bool AllowIntDomain,
38678 SDValue &V1, SDValue &V2, const SDLoc &DL,
38679 SelectionDAG &DAG, const X86Subtarget &Subtarget,
38680 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
38681 bool IsUnary) {
38682 unsigned NumMaskElts = Mask.size();
38683 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38684 unsigned SizeInBits = MaskVT.getSizeInBits();
38685
38686 if (MaskVT.is128BitVector()) {
38687 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
38688 AllowFloatDomain) {
38689 V2 = V1;
38690 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
38691 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
38692 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38693 return true;
38694 }
38695 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
38696 AllowFloatDomain) {
38697 V2 = V1;
38698 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
38699 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38700 return true;
38701 }
38702 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
38703 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
38704 std::swap(V1, V2);
38705 Shuffle = X86ISD::MOVSD;
38706 SrcVT = DstVT = MVT::v2f64;
38707 return true;
38708 }
38709 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
38710 (AllowFloatDomain || !Subtarget.hasSSE41())) {
38711 Shuffle = X86ISD::MOVSS;
38712 SrcVT = DstVT = MVT::v4f32;
38713 return true;
38714 }
38715 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
38716 DAG) &&
38717 Subtarget.hasFP16()) {
38718 Shuffle = X86ISD::MOVSH;
38719 SrcVT = DstVT = MVT::v8f16;
38720 return true;
38721 }
38722 }
38723
38724 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
38725 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
38726 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
38727 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
38728 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
38729 Subtarget)) {
38730 DstVT = MaskVT;
38731 return true;
38732 }
38733 }
38734
38735 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
38736 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
38737 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38738 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
38739 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38740 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
38741 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
38742 Subtarget)) {
38743 SrcVT = DstVT = MaskVT;
38744 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
38745 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
38746 return true;
38747 }
38748 }
38749
38750 // Attempt to match against an OR if we're performing a blend shuffle and the
38751 // non-blended source element is zero in each case.
38752 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
38753 if (SizeInBits == V1.getValueSizeInBits() &&
38754 SizeInBits == V2.getValueSizeInBits() &&
38755 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38756 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
38757 bool IsBlend = true;
38758 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
38759 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
38760 unsigned Scale1 = NumV1Elts / NumMaskElts;
38761 unsigned Scale2 = NumV2Elts / NumMaskElts;
38762 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
38763 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
38764 for (unsigned i = 0; i != NumMaskElts; ++i) {
38765 int M = Mask[i];
38766 if (M == SM_SentinelUndef)
38767 continue;
38768 if (M == SM_SentinelZero) {
38769 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38770 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38771 continue;
38772 }
38773 if (M == (int)i) {
38774 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38775 continue;
38776 }
38777 if (M == (int)(i + NumMaskElts)) {
38778 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38779 continue;
38780 }
38781 IsBlend = false;
38782 break;
38783 }
38784 if (IsBlend) {
38785 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
38786 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
38787 Shuffle = ISD::OR;
38788 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38789 return true;
38790 }
38791 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
38792 // FIXME: handle mismatched sizes?
38793 // TODO: investigate if `ISD::OR` handling in
38794 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
38795 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
38796 unsigned NumElts = V.getValueType().getVectorNumElements();
38797 KnownBits Known(NumElts);
38798 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
38799 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
38800 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
38801 if (PeepholeKnown.isZero())
38802 Known.Zero.setBit(EltIdx);
38803 if (PeepholeKnown.isAllOnes())
38804 Known.One.setBit(EltIdx);
38805 }
38806 return Known;
38807 };
38808
38809 KnownBits V1Known = computeKnownBitsElementWise(V1);
38810 KnownBits V2Known = computeKnownBitsElementWise(V2);
38811
38812 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
38813 int M = Mask[i];
38814 if (M == SM_SentinelUndef)
38815 continue;
38816 if (M == SM_SentinelZero) {
38817 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
38818 continue;
38819 }
38820 if (M == (int)i) {
38821 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
38822 continue;
38823 }
38824 if (M == (int)(i + NumMaskElts)) {
38825 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
38826 continue;
38827 }
38828 llvm_unreachable("will not get here.");
38829 }
38830 if (IsBlend) {
38831 Shuffle = ISD::OR;
38832 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38833 return true;
38834 }
38835 }
38836 }
38837 }
38838
38839 return false;
38840}
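// --- Annotation (not in the original source): a scalar model of the
// blend-as-OR rewrite attempted above. If, for every lane, the source that
// the blend does NOT select is known to be zero, then blending the two
// integer vectors is the same as OR-ing them lane by lane.
#include <array>
#include <cstddef>
#include <cstdint>

static std::array<uint32_t, 4> blendViaOr(const std::array<uint32_t, 4> &V1,
                                          const std::array<uint32_t, 4> &V2) {
  // Precondition (established via MaskedVectorIsZero / known-bits in the code
  // above): for each lane, at least one of V1[i], V2[i] is zero.
  std::array<uint32_t, 4> R{};
  for (std::size_t I = 0; I != R.size(); ++I)
    R[I] = V1[I] | V2[I];
  return R;
}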
38841
38842static bool matchBinaryPermuteShuffle(
38843 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
38844 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
38845 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
38846 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
38847 unsigned NumMaskElts = Mask.size();
38848 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38849
38850 // Attempt to match against VALIGND/VALIGNQ rotate.
38851 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
38852 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
38853 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
38854 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38855 if (!isAnyZero(Mask)) {
38856 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
38857 if (0 < Rotation) {
38858 Shuffle = X86ISD::VALIGN;
38859 if (EltSizeInBits == 64)
38860 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
38861 else
38862 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
38863 PermuteImm = Rotation;
38864 return true;
38865 }
38866 }
38867 }
38868
38869 // Attempt to match against PALIGNR byte rotate.
38870 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38871 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38872 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38873 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38874 if (0 < ByteRotation) {
38875 Shuffle = X86ISD::PALIGNR;
38876 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38877 PermuteImm = ByteRotation;
38878 return true;
38879 }
38880 }
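// --- Annotation (not in the original source): a scalar reference model of the
// PALIGNR byte rotate matched above - conceptually the 32-byte value {Hi:Lo}
// is shifted right by Imm bytes and the low 16 bytes are kept; bytes shifted
// in from beyond the top become zero.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> palignrModel(const std::array<uint8_t, 16> &Hi,
                                            const std::array<uint8_t, 16> &Lo,
                                            unsigned Imm) {
  std::array<uint8_t, 16> R{};
  for (unsigned I = 0; I != 16; ++I) {
    const unsigned Src = I + Imm;
    if (Src < 16)
      R[I] = Lo[Src];
    else if (Src < 32)
      R[I] = Hi[Src - 16];
    // else: leave as zero
  }
  return R;
}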
38881
38882 // Attempt to combine to X86ISD::BLENDI.
38883 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38884 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38885 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38886 uint64_t BlendMask = 0;
38887 bool ForceV1Zero = false, ForceV2Zero = false;
38888 SmallVector<int, 8> TargetMask(Mask);
38889 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
38890 ForceV2Zero, BlendMask)) {
38891 if (MaskVT == MVT::v16i16) {
38892 // We can only use v16i16 PBLENDW if the lanes are repeated.
38893 SmallVector<int, 8> RepeatedMask;
38894 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38895 RepeatedMask)) {
38896 assert(RepeatedMask.size() == 8 &&
38897        "Repeated mask size doesn't match!");
38898 PermuteImm = 0;
38899 for (int i = 0; i < 8; ++i)
38900 if (RepeatedMask[i] >= 8)
38901 PermuteImm |= 1 << i;
38902 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38903 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38904 Shuffle = X86ISD::BLENDI;
38905 ShuffleVT = MaskVT;
38906 return true;
38907 }
38908 } else {
38909 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38910 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38911 PermuteImm = (unsigned)BlendMask;
38912 Shuffle = X86ISD::BLENDI;
38913 ShuffleVT = MaskVT;
38914 return true;
38915 }
38916 }
38917 }
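// --- Annotation (not in the original source): a scalar model of the BLENDI
// immediate assembled above for the v16i16/PBLENDW case - bit i of the
// immediate selects lane i from the second source, otherwise from the first.
#include <array>
#include <cstdint>

static std::array<uint16_t, 8> pblendwModel(const std::array<uint16_t, 8> &V1,
                                            const std::array<uint16_t, 8> &V2,
                                            unsigned Imm) {
  std::array<uint16_t, 8> R{};
  for (unsigned I = 0; I != 8; ++I)
    R[I] = ((Imm >> I) & 1) ? V2[I] : V1[I];
  return R;
}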
38918
38919 // Attempt to combine to INSERTPS, but only if it has elements that need to
38920 // be set to zero.
38921 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38922 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38923 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38924 Shuffle = X86ISD::INSERTPS;
38925 ShuffleVT = MVT::v4f32;
38926 return true;
38927 }
38928
38929 // Attempt to combine to SHUFPD.
38930 if (AllowFloatDomain && EltSizeInBits == 64 &&
38931 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38932 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38933 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38934 bool ForceV1Zero = false, ForceV2Zero = false;
38935 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38936 PermuteImm, Mask, Zeroable)) {
38937 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38938 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38939 Shuffle = X86ISD::SHUFP;
38940 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38941 return true;
38942 }
38943 }
38944
38945 // Attempt to combine to SHUFPS.
38946 if (AllowFloatDomain && EltSizeInBits == 32 &&
38947 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38948 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38949 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38950 SmallVector<int, 4> RepeatedMask;
38951 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38952 // Match each half of the repeated mask, to determine if it's just
38953 // referencing one of the vectors, is zeroable or is entirely undef.
38954 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38955 int M0 = RepeatedMask[Offset];
38956 int M1 = RepeatedMask[Offset + 1];
38957
38958 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38959 return DAG.getUNDEF(MaskVT);
38960 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38961 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38962 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38963 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38964 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38965 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38966 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38967 return V1;
38968 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38969 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38970 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38971 return V2;
38972 }
38973
38974 return SDValue();
38975 };
38976
38977 int ShufMask[4] = {-1, -1, -1, -1};
38978 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38979 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38980
38981 if (Lo && Hi) {
38982 V1 = Lo;
38983 V2 = Hi;
38984 Shuffle = X86ISD::SHUFP;
38985 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38986 PermuteImm = getV4X86ShuffleImm(ShufMask);
38987 return true;
38988 }
38989 }
38990 }
38991
38992 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38993 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38994 MaskVT.is128BitVector() &&
38995 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38996 Shuffle = X86ISD::INSERTPS;
38997 ShuffleVT = MVT::v4f32;
38998 return true;
38999 }
39000
39001 return false;
39002}
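// --- Annotation (not in the original source): a scalar reference model of
// SHUFPS, which the MatchHalf logic above targets - the low two result lanes
// select from the first source and the high two from the second, each through
// a 2-bit field of the immediate.
#include <array>

static std::array<float, 4> shufpsModel(const std::array<float, 4> &V1,
                                        const std::array<float, 4> &V2,
                                        unsigned Imm) {
  return {V1[(Imm >> 0) & 3], V1[(Imm >> 2) & 3],
          V2[(Imm >> 4) & 3], V2[(Imm >> 6) & 3]};
}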
39003
39004static SDValue combineX86ShuffleChainWithExtract(
39005 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39006 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39007 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39008 const X86Subtarget &Subtarget);
39009
39010/// Combine an arbitrary chain of shuffles into a single instruction if
39011/// possible.
39012///
39013/// This is the leaf of the recursive combine below. When we have found some
39014/// chain of single-use x86 shuffle instructions and accumulated the combined
39015/// shuffle mask represented by them, this will try to pattern match that mask
39016/// into either a single instruction if there is a special purpose instruction
39017/// for this operation, or into a PSHUFB instruction which is a fully general
39018/// instruction but should only be used to replace chains over a certain depth.
39019static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39020 ArrayRef<int> BaseMask, int Depth,
39021 bool HasVariableMask,
39022 bool AllowVariableCrossLaneMask,
39023 bool AllowVariablePerLaneMask,
39024 SelectionDAG &DAG,
39025 const X86Subtarget &Subtarget) {
39026 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39027 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39028        "Unexpected number of shuffle inputs!");
39029
39030 SDLoc DL(Root);
39031 MVT RootVT = Root.getSimpleValueType();
39032 unsigned RootSizeInBits = RootVT.getSizeInBits();
39033 unsigned NumRootElts = RootVT.getVectorNumElements();
39034
39035 // Canonicalize shuffle input op to the requested type.
39036 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39037 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39038 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39039 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39040 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39041 return DAG.getBitcast(VT, Op);
39042 };
39043
39044 // Find the inputs that enter the chain. Note that multiple uses are OK
39045 // here, we're not going to remove the operands we find.
39046 bool UnaryShuffle = (Inputs.size() == 1);
39047 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39048 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39049 : peekThroughBitcasts(Inputs[1]));
39050
39051 MVT VT1 = V1.getSimpleValueType();
39052 MVT VT2 = V2.getSimpleValueType();
39053 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39054        (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39055
39056 SDValue Res;
39057
39058 unsigned NumBaseMaskElts = BaseMask.size();
39059 if (NumBaseMaskElts == 1) {
39060 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39061 return CanonicalizeShuffleInput(RootVT, V1);
39062 }
39063
39064 bool OptForSize = DAG.shouldOptForSize();
39065 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39066 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39067 (RootVT.isFloatingPoint() && Depth >= 1) ||
39068 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39069
39070 // Don't combine if we are an AVX512/EVEX target and the mask element size
39071 // is different from the root element size - this would prevent writemasks
39072 // from being reused.
39073 bool IsMaskedShuffle = false;
39074 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39075 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39076 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39077 IsMaskedShuffle = true;
39078 }
39079 }
39080
39081 // If we are shuffling a splat (and not introducing zeros) then we can just
39082 // use it directly. This works for smaller elements as well, as they already
39083 // repeat across each mask element.
39084 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39085 V1.getValueSizeInBits() >= RootSizeInBits &&
39086 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39087 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39088 return CanonicalizeShuffleInput(RootVT, V1);
39089 }
39090
39091 SmallVector<int, 64> Mask(BaseMask);
39092
39093 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39094 // etc. can be simplified.
39095 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39096 SmallVector<int> ScaledMask, IdentityMask;
39097 unsigned NumElts = VT1.getVectorNumElements();
39098 if (Mask.size() <= NumElts &&
39099 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39100 for (unsigned i = 0; i != NumElts; ++i)
39101 IdentityMask.push_back(i);
39102 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39103 V2))
39104 return CanonicalizeShuffleInput(RootVT, V1);
39105 }
39106 }
39107
39108 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39109 if (RootVT.is512BitVector() &&
39110 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39111 // If the upper subvectors are zeroable, then an extract+insert is better
39112 // than using X86ISD::SHUF128. The insertion is free, even if it has to zero
39113 // the upper subvectors.
39114 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39115 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39116 return SDValue(); // Nothing to do!
39117 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39118        "Unexpected lane shuffle");
39119 Res = CanonicalizeShuffleInput(RootVT, V1);
39120 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39121 bool UseZero = isAnyZero(Mask);
39122 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39123 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39124 }
39125
39126 // Narrow shuffle mask to v4x128.
39127 SmallVector<int, 4> ScaledMask;
39128 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39129 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39130
39131 // Try to lower to vshuf64x2/vshuf32x4.
39132 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39133 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39134 SelectionDAG &DAG) {
39135 unsigned PermMask = 0;
39136 // Ensure elements came from the same Op.
39137 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39138 for (int i = 0; i < 4; ++i) {
39139 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39140 if (ScaledMask[i] < 0)
39141 continue;
39142
39143 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39144 unsigned OpIndex = i / 2;
39145 if (Ops[OpIndex].isUndef())
39146 Ops[OpIndex] = Op;
39147 else if (Ops[OpIndex] != Op)
39148 return SDValue();
39149
39150 // Convert the 128-bit shuffle mask selection values into 128-bit
39151 // selection bits defined by a vshuf64x2 instruction's immediate control
39152 // byte.
39153 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39154 }
39155
39156 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39157 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39158 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39159 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39160 };
39161
39162 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39163 // doesn't work because our mask is for 128 bits and we don't have an MVT
39164 // to match that.
39165 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39166 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39167 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39168 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39169 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39170 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39171 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39172 ScaledMask[1] == (ScaledMask[3] % 2));
39173
39174 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39175 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39176 return SDValue(); // Nothing to do!
39177 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39178 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39179 return DAG.getBitcast(RootVT, V);
39180 }
39181 }
39182
39183 // Handle 128-bit lane shuffles of 256-bit vectors.
39184 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39185 // If the upper half is zeroable, then an extract+insert is better than
39186 // using X86ISD::VPERM2X128. The insertion is free, even if it has to zero
39187 // the upper half.
39188 if (isUndefOrZero(Mask[1])) {
39189 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39190 return SDValue(); // Nothing to do!
39191 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39192 Res = CanonicalizeShuffleInput(RootVT, V1);
39193 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39194 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39195 256);
39196 }
39197
39198 // If we're inserting the low subvector, an insert-subvector 'concat'
39199 // pattern is quicker than VPERM2X128.
39200 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39201 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39202 !Subtarget.hasAVX2()) {
39203 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39204 return SDValue(); // Nothing to do!
39205 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39206 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39207 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39208 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39209 }
39210
39211 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39212 return SDValue(); // Nothing to do!
39213
39214 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39215 // we need to use the zeroing feature.
39216 // Prefer blends for sequential shuffles unless we are optimizing for size.
39217 if (UnaryShuffle &&
39218 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39219 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39220 unsigned PermMask = 0;
39221 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39222 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39223 return DAG.getNode(
39224 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39225 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39226 }
39227
39228 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39229 return SDValue(); // Nothing to do!
39230
39231 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39232 if (!UnaryShuffle && !IsMaskedShuffle) {
39233 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39234        "Unexpected shuffle sentinel value");
39235 // Prefer blends to X86ISD::VPERM2X128.
39236 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39237 unsigned PermMask = 0;
39238 PermMask |= ((Mask[0] & 3) << 0);
39239 PermMask |= ((Mask[1] & 3) << 4);
39240 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39241 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39242 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39243 CanonicalizeShuffleInput(RootVT, LHS),
39244 CanonicalizeShuffleInput(RootVT, RHS),
39245 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39246 }
39247 }
39248 }
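// --- Annotation (not in the original source): a reference model of the
// VPERM2X128 immediate constructed above. Each 4-bit field picks one 128-bit
// half of the concatenated sources (0-3); setting bit 3 of a field zeroes that
// destination half instead, which is what the 0x8 sentinel above encodes.
#include <array>
#include <cstdint>

using Lane128 = std::array<uint64_t, 2>;

static std::array<Lane128, 2> vperm2x128Model(const std::array<Lane128, 2> &V1,
                                              const std::array<Lane128, 2> &V2,
                                              unsigned Imm) {
  const std::array<Lane128, 4> Cat = {V1[0], V1[1], V2[0], V2[1]};
  auto Pick = [&](unsigned Field) -> Lane128 {
    return (Field & 0x8) ? Lane128{0, 0} : Cat[Field & 0x3];
  };
  return {Pick(Imm & 0xF), Pick((Imm >> 4) & 0xF)};
}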
39249
39250 // For masks that have been widened to 128-bit elements or more,
39251 // narrow back down to 64-bit elements.
39252 if (BaseMaskEltSizeInBits > 64) {
39253 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39254 int MaskScale = BaseMaskEltSizeInBits / 64;
39255 SmallVector<int, 64> ScaledMask;
39256 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39257 Mask = std::move(ScaledMask);
39258 }
39259
39260 // For masked shuffles, we're trying to match the root width for better
39261 // writemask folding, attempt to scale the mask.
39262 // TODO - variable shuffles might need this to be widened again.
39263 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39264 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39265 int MaskScale = NumRootElts / Mask.size();
39266 SmallVector<int, 64> ScaledMask;
39267 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39268 Mask = std::move(ScaledMask);
39269 }
39270
39271 unsigned NumMaskElts = Mask.size();
39272 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39273
39274 // Determine the effective mask value type.
39275 FloatDomain &= (32 <= MaskEltSizeInBits);
39276 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39277 : MVT::getIntegerVT(MaskEltSizeInBits);
39278 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39279
39280 // Only allow legal mask types.
39281 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39282 return SDValue();
39283
39284 // Attempt to match the mask against known shuffle patterns.
39285 MVT ShuffleSrcVT, ShuffleVT;
39286 unsigned Shuffle, PermuteImm;
39287
39288 // Which shuffle domains are permitted?
39289 // Permit domain crossing at higher combine depths.
39290 // TODO: Should we indicate which domain is preferred if both are allowed?
39291 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39292 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39293 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39294
39295 // Determine zeroable mask elements.
39296 APInt KnownUndef, KnownZero;
39297 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39298 APInt Zeroable = KnownUndef | KnownZero;
39299
39300 if (UnaryShuffle) {
39301 // Attempt to match against broadcast-from-vector.
39302 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39303 if ((Subtarget.hasAVX2() ||
39304 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39305 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39306 if (isUndefOrEqual(Mask, 0)) {
39307 if (V1.getValueType() == MaskVT &&
39308 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39309 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39310 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39311 return SDValue(); // Nothing to do!
39312 Res = V1.getOperand(0);
39313 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39314 return DAG.getBitcast(RootVT, Res);
39315 }
39316 if (Subtarget.hasAVX2()) {
39317 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39318 return SDValue(); // Nothing to do!
39319 Res = CanonicalizeShuffleInput(MaskVT, V1);
39320 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39321 return DAG.getBitcast(RootVT, Res);
39322 }
39323 }
39324 }
39325
39326 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39327 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39328 (!IsMaskedShuffle ||
39329 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39330 if (Depth == 0 && Root.getOpcode() == Shuffle)
39331 return SDValue(); // Nothing to do!
39332 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39333 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39334 return DAG.getBitcast(RootVT, Res);
39335 }
39336
39337 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39338 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39339 PermuteImm) &&
39340 (!IsMaskedShuffle ||
39341 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39342 if (Depth == 0 && Root.getOpcode() == Shuffle)
39343 return SDValue(); // Nothing to do!
39344 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39345 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39346 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39347 return DAG.getBitcast(RootVT, Res);
39348 }
39349 }
39350
39351 // Attempt to combine to INSERTPS, but only if the inserted element has come
39352 // from a scalar.
39353 // TODO: Handle other insertions here as well?
39354 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39355 Subtarget.hasSSE41() &&
39356 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39357 if (MaskEltSizeInBits == 32) {
39358 SDValue SrcV1 = V1, SrcV2 = V2;
39359 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39360 DAG) &&
39361 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39362 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39363 return SDValue(); // Nothing to do!
39364 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39365 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39366 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39367 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39368 return DAG.getBitcast(RootVT, Res);
39369 }
39370 }
39371 if (MaskEltSizeInBits == 64 &&
39372 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39373 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39374 V2.getScalarValueSizeInBits() <= 32) {
39375 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39376 return SDValue(); // Nothing to do!
39377 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39378 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39379 CanonicalizeShuffleInput(MVT::v4f32, V1),
39380 CanonicalizeShuffleInput(MVT::v4f32, V2),
39381 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39382 return DAG.getBitcast(RootVT, Res);
39383 }
39384 }
39385
39386 SDValue NewV1 = V1; // Save operands in case early exit happens.
39387 SDValue NewV2 = V2;
39388 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39389 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39390 ShuffleVT, UnaryShuffle) &&
39391 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39392 if (Depth == 0 && Root.getOpcode() == Shuffle)
39393 return SDValue(); // Nothing to do!
39394 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39395 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39396 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39397 return DAG.getBitcast(RootVT, Res);
39398 }
39399
39400 NewV1 = V1; // Save operands in case early exit happens.
39401 NewV2 = V2;
39402 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39403 AllowIntDomain, NewV1, NewV2, DL, DAG,
39404 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39405 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39406 if (Depth == 0 && Root.getOpcode() == Shuffle)
39407 return SDValue(); // Nothing to do!
39408 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39409 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39410 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39411 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39412 return DAG.getBitcast(RootVT, Res);
39413 }
39414
39415 // Typically from here on, we need an integer version of MaskVT.
39416 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39417 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39418
39419 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39420 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39421 uint64_t BitLen, BitIdx;
39422 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39423 Zeroable)) {
39424 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39425 return SDValue(); // Nothing to do!
39426 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39427 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39428 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39429 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39430 return DAG.getBitcast(RootVT, Res);
39431 }
39432
39433 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39434 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39435 return SDValue(); // Nothing to do!
39436 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39437 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39438 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39439 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39440 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39441 return DAG.getBitcast(RootVT, Res);
39442 }
39443 }
39444
39445 // Match shuffle against TRUNCATE patterns.
39446 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39447 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39448 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39449 Subtarget)) {
39450 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39451 ShuffleSrcVT.getVectorNumElements();
39452 unsigned Opc =
39453 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
39454 if (Depth == 0 && Root.getOpcode() == Opc)
39455 return SDValue(); // Nothing to do!
39456 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39457 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
39458 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
39459 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
39460 return DAG.getBitcast(RootVT, Res);
39461 }
39462
39463 // Do we need a more general binary truncation pattern?
39464 if (RootSizeInBits < 512 &&
39465 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
39466 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
39467 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
39468 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
39469 // Bail if this was already a truncation or PACK node.
39470 // We sometimes fail to match PACK if we demand known undef elements.
39471 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39472 Root.getOpcode() == X86ISD::PACKSS ||
39473 Root.getOpcode() == X86ISD::PACKUS))
39474 return SDValue(); // Nothing to do!
39475 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39476 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
39477 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39478 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
39479 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39480 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
39481 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
39482 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
39483 return DAG.getBitcast(RootVT, Res);
39484 }
39485 }
39486
39487 // Don't try to re-form single instruction chains under any circumstances now
39488 // that we've done encoding canonicalization for them.
39489 if (Depth < 1)
39490 return SDValue();
39491
39492 // Depth threshold above which we can efficiently use variable mask shuffles.
39493 int VariableCrossLaneShuffleDepth =
39494 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
39495 int VariablePerLaneShuffleDepth =
39496 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
39497 AllowVariableCrossLaneMask &=
39498 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
39499 AllowVariablePerLaneMask &=
39500 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
39501 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
39502 // higher depth before combining them.
39503 bool AllowBWIVPERMV3 =
39504 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
39505
39506 bool MaskContainsZeros = isAnyZero(Mask);
39507
39508 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
39509 // If we have a single input lane-crossing shuffle then lower to VPERMV.
39510 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
39511 if (Subtarget.hasAVX2() &&
39512 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
39513 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
39514 Res = CanonicalizeShuffleInput(MaskVT, V1);
39515 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
39516 return DAG.getBitcast(RootVT, Res);
39517 }
39518 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
39519 if ((Subtarget.hasAVX512() &&
39520 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39521 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
39522 (Subtarget.hasBWI() &&
39523 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39524 (Subtarget.hasVBMI() &&
39525 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
39526 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39527 V2 = DAG.getUNDEF(MaskVT);
39528 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39529 return DAG.getBitcast(RootVT, Res);
39530 }
39531 }
39532
39533 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
39534 // vector as the second source (non-VLX will pad to 512-bit shuffles).
39535 if (UnaryShuffle && AllowVariableCrossLaneMask &&
39536 ((Subtarget.hasAVX512() &&
39537 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39538 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39539 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
39540 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
39541 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39542 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39543 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39544 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39545 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
39546 for (unsigned i = 0; i != NumMaskElts; ++i)
39547 if (Mask[i] == SM_SentinelZero)
39548 Mask[i] = NumMaskElts + i;
39549 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39550 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
39551 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39552 return DAG.getBitcast(RootVT, Res);
39553 }
39554
39555 // If that failed and either input is extracted then try to combine as a
39556 // shuffle with the larger type.
39557 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39558 Inputs, Root, BaseMask, Depth, HasVariableMask,
39559 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
39560 Subtarget))
39561 return WideShuffle;
39562
39563 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
39564 // (non-VLX will pad to 512-bit shuffles).
39565 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
39566 ((Subtarget.hasAVX512() &&
39567 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39568 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39569 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
39570 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
39571 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39572 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39573 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39574 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39575 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39576 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39577 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39578 return DAG.getBitcast(RootVT, Res);
39579 }
39580 return SDValue();
39581 }
39582
39583 // See if we can combine a single input shuffle with zeros to a bit-mask,
39584 // which is much simpler than any shuffle.
39585 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
39586 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
39587 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
39588 APInt Zero = APInt::getZero(MaskEltSizeInBits);
39589 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
39590 APInt UndefElts(NumMaskElts, 0);
39591 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
39592 for (unsigned i = 0; i != NumMaskElts; ++i) {
39593 int M = Mask[i];
39594 if (M == SM_SentinelUndef) {
39595 UndefElts.setBit(i);
39596 continue;
39597 }
39598 if (M == SM_SentinelZero)
39599 continue;
39600 EltBits[i] = AllOnes;
39601 }
39602 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
39603 Res = CanonicalizeShuffleInput(MaskVT, V1);
39604 unsigned AndOpcode =
39605 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
39606 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
39607 return DAG.getBitcast(RootVT, Res);
39608 }
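// --- Annotation (not in the original source): a scalar model of the bit-mask
// combine above - a shuffle that only keeps lanes in place or zeroes them can
// be replaced by an AND with a constant vector of all-ones / all-zeros lanes.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> zeroLanesViaAnd(const std::array<uint32_t, 4> &V,
                                               const std::array<bool, 4> &Keep) {
  std::array<uint32_t, 4> R{};
  for (unsigned I = 0; I != 4; ++I)
    R[I] = V[I] & (Keep[I] ? 0xFFFFFFFFu : 0u);
  return R;
}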
39609
39610 // If we have a single input shuffle with different shuffle patterns in the
39611 // 128-bit lanes, use a variable mask with VPERMILPS.
39612 // TODO: Combine other mask types at higher depths.
39613 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39614 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
39615 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
39616 SmallVector<SDValue, 16> VPermIdx;
39617 for (int M : Mask) {
39618 SDValue Idx =
39619 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
39620 VPermIdx.push_back(Idx);
39621 }
39622 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
39623 Res = CanonicalizeShuffleInput(MaskVT, V1);
39624 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
39625 return DAG.getBitcast(RootVT, Res);
39626 }
39627
39628 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
39629 // to VPERMIL2PD/VPERMIL2PS.
39630 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
39631 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
39632 MaskVT == MVT::v8f32)) {
39633 // VPERMIL2 Operation.
39634 // Bits[3] - Match Bit.
39635 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
39636 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
39637 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
39638 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
39639 SmallVector<int, 8> VPerm2Idx;
39640 unsigned M2ZImm = 0;
39641 for (int M : Mask) {
39642 if (M == SM_SentinelUndef) {
39643 VPerm2Idx.push_back(-1);
39644 continue;
39645 }
39646 if (M == SM_SentinelZero) {
39647 M2ZImm = 2;
39648 VPerm2Idx.push_back(8);
39649 continue;
39650 }
39651 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
39652 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
39653 VPerm2Idx.push_back(Index);
39654 }
39655 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39656 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39657 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
39658 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
39659 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
39660 return DAG.getBitcast(RootVT, Res);
39661 }
39662
39663 // If we have 3 or more shuffle instructions or a chain involving a variable
39664 // mask, we can replace them with a single PSHUFB instruction profitably.
39665 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
39666 // instructions, but in practice PSHUFB tends to be *very* fast so we're
39667 // more aggressive.
39668 if (UnaryShuffle && AllowVariablePerLaneMask &&
39669 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39670 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
39671 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
39672 SmallVector<SDValue, 16> PSHUFBMask;
39673 int NumBytes = RootVT.getSizeInBits() / 8;
39674 int Ratio = NumBytes / NumMaskElts;
39675 for (int i = 0; i < NumBytes; ++i) {
39676 int M = Mask[i / Ratio];
39677 if (M == SM_SentinelUndef) {
39678 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
39679 continue;
39680 }
39681 if (M == SM_SentinelZero) {
39682 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39683 continue;
39684 }
39685 M = Ratio * M + i % Ratio;
39686 assert((M / 16) == (i / 16) && "Lane crossing detected");
39687 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39688 }
39689 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
39690 Res = CanonicalizeShuffleInput(ByteVT, V1);
39691 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
39692 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
39693 return DAG.getBitcast(RootVT, Res);
39694 }
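// --- Annotation (not in the original source): a scalar model of the PSHUFB
// control bytes emitted above. Within each 16-byte lane, a control byte with
// the top bit set (the 0x80 constant) zeroes the result byte; otherwise its
// low four bits index a source byte in the same lane.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> pshufbModel(const std::array<uint8_t, 16> &Src,
                                           const std::array<uint8_t, 16> &Ctl) {
  std::array<uint8_t, 16> R{};
  for (unsigned I = 0; I != 16; ++I)
    R[I] = (Ctl[I] & 0x80) ? 0 : Src[Ctl[I] & 0x0F];
  return R;
}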
39695
39696 // With XOP, if we have a 128-bit binary input shuffle we can always combine
39697 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
39698 // slower than PSHUFB on targets that support both.
39699 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
39700 Subtarget.hasXOP()) {
39701 // VPPERM Mask Operation
39702 // Bits[4:0] - Byte Index (0 - 31)
39703 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
39704 SmallVector<SDValue, 16> VPPERMMask;
39705 int NumBytes = 16;
39706 int Ratio = NumBytes / NumMaskElts;
39707 for (int i = 0; i < NumBytes; ++i) {
39708 int M = Mask[i / Ratio];
39709 if (M == SM_SentinelUndef) {
39710 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
39711 continue;
39712 }
39713 if (M == SM_SentinelZero) {
39714 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39715 continue;
39716 }
39717 M = Ratio * M + i % Ratio;
39718 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39719 }
39720 MVT ByteVT = MVT::v16i8;
39721 V1 = CanonicalizeShuffleInput(ByteVT, V1);
39722 V2 = CanonicalizeShuffleInput(ByteVT, V2);
39723 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
39724 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
39725 return DAG.getBitcast(RootVT, Res);
39726 }
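// --- Annotation (not in the original source): the corresponding scalar model
// for the XOP VPPERM path, restricted to the control bytes this code actually
// emits - plain byte indices 0-31, where 0-15 select from the first source and
// 16-31 from the second, or 0x80 (operation field 4) to produce a zero byte.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> vppermModel(const std::array<uint8_t, 16> &V1,
                                           const std::array<uint8_t, 16> &V2,
                                           const std::array<uint8_t, 16> &Ctl) {
  std::array<uint8_t, 16> R{};
  for (unsigned I = 0; I != 16; ++I) {
    const uint8_t C = Ctl[I];
    if (C & 0x80) {
      R[I] = 0;                                      // zeroed byte
      continue;
    }
    R[I] = (C & 0x10) ? V2[C & 0x0F] : V1[C & 0x0F]; // 0-15: V1, 16-31: V2
  }
  return R;
}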
39727
39728 // If that failed and either input is extracted then try to combine as a
39729 // shuffle with the larger type.
39730 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39731 Inputs, Root, BaseMask, Depth, HasVariableMask,
39732 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
39733 return WideShuffle;
39734
39735 // If we have a dual input shuffle then lower to VPERMV3,
39736 // (non-VLX will pad to 512-bit shuffles)
39737 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39738 ((Subtarget.hasAVX512() &&
39739 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
39740 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
39741 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
39742 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
39743 MaskVT == MVT::v16i32)) ||
39744 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39745 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
39746 MaskVT == MVT::v32i16)) ||
39747 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39748 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
39749 MaskVT == MVT::v64i8)))) {
39750 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39751 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39752 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39753 return DAG.getBitcast(RootVT, Res);
39754 }
39755
39756 // Failed to find any combines.
39757 return SDValue();
39758}
39759
39760// Combine an arbitrary chain of shuffles + extract_subvectors into a single
39761// instruction if possible.
39762//
39763// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
39764// type size to attempt to combine:
39765// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
39766// -->
39767// extract_subvector(shuffle(x,y,m2),0)
39768static SDValue combineX86ShuffleChainWithExtract(
39769 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39770 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39771 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39772 const X86Subtarget &Subtarget) {
39773 unsigned NumMaskElts = BaseMask.size();
39774 unsigned NumInputs = Inputs.size();
39775 if (NumInputs == 0)
39776 return SDValue();
39777
39778 EVT RootVT = Root.getValueType();
39779 unsigned RootSizeInBits = RootVT.getSizeInBits();
39780 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
39781 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
39782
39783 // Peek through extract_subvector to find widest legal vector.
39784 // TODO: Handle ISD::TRUNCATE
39785 unsigned WideSizeInBits = RootSizeInBits;
39786 for (unsigned I = 0; I != NumInputs; ++I) {
39787 SDValue Input = peekThroughBitcasts(Inputs[I]);
39788 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
39789 Input = peekThroughBitcasts(Input.getOperand(0));
39790 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
39791 WideSizeInBits < Input.getValueSizeInBits())
39792 WideSizeInBits = Input.getValueSizeInBits();
39793 }
39794
39795 // Bail if we fail to find a source larger than the existing root.
39796 unsigned Scale = WideSizeInBits / RootSizeInBits;
39797 if (WideSizeInBits <= RootSizeInBits ||
39798 (WideSizeInBits % RootSizeInBits) != 0)
39799 return SDValue();
39800
39801 // Create new mask for larger type.
39802 SmallVector<int, 64> WideMask(BaseMask);
39803 for (int &M : WideMask) {
39804 if (M < 0)
39805 continue;
39806 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
39807 }
39808 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
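// [Editorial illustration - not part of X86ISelLowering.cpp] Example of the
// mask widening above with hypothetical values NumMaskElts = 4, Scale = 2 and
// BaseMask = {5, 0, SM_SentinelUndef, 2}: element 5 (element 1 of input #1)
// becomes (5 % 4) + (5 / 4) * 2 * 4 = 9, elements referencing input #0 keep
// their value, and (Scale - 1) * NumMaskElts = 4 undefs are appended:
//   WideMask = {9, 0, undef, 2, undef, undef, undef, undef}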
39809
39810 // Attempt to peek through inputs and adjust mask when we extract from an
39811 // upper subvector.
39812 int AdjustedMasks = 0;
39813 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
39814 for (unsigned I = 0; I != NumInputs; ++I) {
39815 SDValue &Input = WideInputs[I];
39816 Input = peekThroughBitcasts(Input);
39817 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39818 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
39819 uint64_t Idx = Input.getConstantOperandVal(1);
39820 if (Idx != 0) {
39821 ++AdjustedMasks;
39822 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
39823 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
39824
39825 int lo = I * WideMask.size();
39826 int hi = (I + 1) * WideMask.size();
39827 for (int &M : WideMask)
39828 if (lo <= M && M < hi)
39829 M += Idx;
39830 }
39831 Input = peekThroughBitcasts(Input.getOperand(0));
39832 }
39833 }
39834
39835 // Remove unused/repeated shuffle source ops.
39836 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
39837 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
39838
39839 // Bail if we're always extracting from the lowest subvectors
39840 // (combineX86ShuffleChain should match this for the current width), or if
39841 // the shuffle still references too many inputs.
39842 if (AdjustedMasks == 0 || WideInputs.size() > 2)
39843 return SDValue();
39844
39845 // Minor canonicalization of the accumulated shuffle mask to make it easier
39846 // to match below. All this does is detect masks with sequential pairs of
39847 // elements, and shrink them to the half-width mask. It does this in a loop
39848 // so it will reduce the size of the mask to the minimal width mask which
39849 // performs an equivalent shuffle.
39850 while (WideMask.size() > 1) {
39851 SmallVector<int, 64> WidenedMask;
39852 if (!canWidenShuffleElements(WideMask, WidenedMask))
39853 break;
39854 WideMask = std::move(WidenedMask);
39855 }
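// [Editorial illustration - not part of X86ISelLowering.cpp] Example of the
// canonicalization loop above: a hypothetical v4i32-style mask {2, 3, 0, 1}
// pairs up as {2,3} and {0,1}, so canWidenShuffleElements shrinks it to the
// equivalent v2i64-style mask {1, 0}; the loop then stops because {1, 0} has
// no sequential pairs left.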
39856
39857 // Canonicalization of binary shuffle masks to improve pattern matching by
39858 // commuting the inputs.
39859 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
39860 ShuffleVectorSDNode::commuteMask(WideMask);
39861 std::swap(WideInputs[0], WideInputs[1]);
39862 }
39863
39864 // Increase depth for every upper subvector we've peeked through.
39865 Depth += AdjustedMasks;
39866
39867 // Attempt to combine wider chain.
39868 // TODO: Can we use a better Root?
39869 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
39870 WideInputs.back().getValueSizeInBits()
39871 ? WideInputs.front()
39872 : WideInputs.back();
39873 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
39874 "WideRootSize mismatch");
39875
39876 if (SDValue WideShuffle =
39877 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39878 HasVariableMask, AllowVariableCrossLaneMask,
39879 AllowVariablePerLaneMask, DAG, Subtarget)) {
39880 WideShuffle =
39881 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39882 return DAG.getBitcast(RootVT, WideShuffle);
39883 }
39884
39885 return SDValue();
39886}
39887
39888// Canonicalize the combined shuffle mask chain with horizontal ops.
39889// NOTE: This may update the Ops and Mask.
39890static SDValue canonicalizeShuffleMaskWithHorizOp(
39891 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39892 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39893 const X86Subtarget &Subtarget) {
39894 if (Mask.empty() || Ops.empty())
39895 return SDValue();
39896
39897 SmallVector<SDValue> BC;
39898 for (SDValue Op : Ops)
39899 BC.push_back(peekThroughBitcasts(Op));
39900
39901 // All ops must be the same horizop + type.
39902 SDValue BC0 = BC[0];
39903 EVT VT0 = BC0.getValueType();
39904 unsigned Opcode0 = BC0.getOpcode();
39905 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39906 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39907 }))
39908 return SDValue();
39909
39910 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39911 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39912 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39913 if (!isHoriz && !isPack)
39914 return SDValue();
39915
39916 // Do all ops have a single use?
39917 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39918 return Op.hasOneUse() &&
39919 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
39920 });
39921
39922 int NumElts = VT0.getVectorNumElements();
39923 int NumLanes = VT0.getSizeInBits() / 128;
39924 int NumEltsPerLane = NumElts / NumLanes;
39925 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39926 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39927 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39928
39929 if (NumEltsPerLane >= 4 &&
39930 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39931 SmallVector<int> LaneMask, ScaledMask;
39932 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39933 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39934 // See if we can remove the shuffle by re-sorting the HOP chain so that
39935 // the HOP args are pre-shuffled.
39936 // TODO: Generalize to any sized/depth chain.
39937 // TODO: Add support for PACKSS/PACKUS.
39938 if (isHoriz) {
39939 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39940 auto GetHOpSrc = [&](int M) {
39941 if (M == SM_SentinelUndef)
39942 return DAG.getUNDEF(VT0);
39943 if (M == SM_SentinelZero)
39944 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39945 SDValue Src0 = BC[M / 4];
39946 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39947 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39948 return Src1.getOperand(M % 2);
39949 return SDValue();
39950 };
39951 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39952 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39953 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39954 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39955 if (M0 && M1 && M2 && M3) {
39956 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39957 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39958 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39959 }
39960 }
39961 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39962 if (Ops.size() >= 2) {
39963 SDValue LHS, RHS;
39964 auto GetHOpSrc = [&](int M, int &OutM) {
39965 // TODO: Support SM_SentinelZero
39966 if (M < 0)
39967 return M == SM_SentinelUndef;
39968 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39969 if (!LHS || LHS == Src) {
39970 LHS = Src;
39971 OutM = (M % 2);
39972 return true;
39973 }
39974 if (!RHS || RHS == Src) {
39975 RHS = Src;
39976 OutM = (M % 2) + 2;
39977 return true;
39978 }
39979 return false;
39980 };
39981 int PostMask[4] = {-1, -1, -1, -1};
39982 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39983 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39984 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39985 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39986 LHS = DAG.getBitcast(SrcVT, LHS);
39987 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39988 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39989 // Use SHUFPS for the permute so this will work on SSE3 targets,
39990 // shuffle combining and domain handling will simplify this later on.
39991 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39992 Res = DAG.getBitcast(ShuffleVT, Res);
39993 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39994 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39995 }
39996 }
39997 }
39998 }
39999
40000 if (2 < Ops.size())
40001 return SDValue();
40002
40003 SDValue BC1 = BC[BC.size() - 1];
40004 if (Mask.size() == VT0.getVectorNumElements()) {
40005 // Canonicalize binary shuffles of horizontal ops that use the
40006 // same sources to a unary shuffle.
40007 // TODO: Try to perform this fold even if the shuffle remains.
40008 if (Ops.size() == 2) {
40009 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40010 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40011 };
40012 // Commute if all BC0's ops are contained in BC1.
40013 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40014 ContainsOps(BC1, BC0.getOperand(1))) {
40015 ShuffleVectorSDNode::commuteMask(Mask);
40016 std::swap(Ops[0], Ops[1]);
40017 std::swap(BC0, BC1);
40018 }
40019
40020 // If BC1 can be represented by BC0, then convert to unary shuffle.
40021 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40022 ContainsOps(BC0, BC1.getOperand(1))) {
40023 for (int &M : Mask) {
40024 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40025 continue;
40026 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40027 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40028 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40029 M += NumHalfEltsPerLane;
40030 }
40031 }
40032 }
40033
40034 // Canonicalize unary horizontal ops to only refer to lower halves.
40035 for (int i = 0; i != NumElts; ++i) {
40036 int &M = Mask[i];
40037 if (isUndefOrZero(M))
40038 continue;
40039 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40040 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40041 M -= NumHalfEltsPerLane;
40042 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40043 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40044 M -= NumHalfEltsPerLane;
40045 }
40046 }
40047
40048 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40049 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40050 // represents the LHS/RHS inputs for the lower/upper halves.
40051 SmallVector<int, 16> TargetMask128, WideMask128;
40052 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40053 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40054 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40055 bool SingleOp = (Ops.size() == 1);
40056 if (isPack || OneUseOps ||
40057 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40058 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40059 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40060 Lo = Lo.getOperand(WideMask128[0] & 1);
40061 Hi = Hi.getOperand(WideMask128[1] & 1);
40062 if (SingleOp) {
40063 SDValue Undef = DAG.getUNDEF(SrcVT);
40064 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40065 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40066 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40067 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40068 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40069 }
40070 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40071 }
40072 }
40073
40074 return SDValue();
40075}
40076
40077// Attempt to constant fold all of the constant source ops.
40078// Returns true if the entire shuffle is folded to a constant.
40079// TODO: Extend this to merge multiple constant Ops and update the mask.
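// [Editorial illustration - not part of X86ISelLowering.cpp] Sketch of what
// the constant folding below computes, for a hypothetical single constant
// source op {10, 20, 30, 40} and Mask = {1, SM_SentinelZero, SM_SentinelUndef, 0}:
// the folded result is the constant vector {20, 0, undef, 10}, assembled from
// UndefElts/ZeroElts/ConstantBitData exactly as the loop gathers them.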
40080static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40081 ArrayRef<int> Mask, SDValue Root,
40082 bool HasVariableMask,
40083 SelectionDAG &DAG,
40084 const X86Subtarget &Subtarget) {
40085 MVT VT = Root.getSimpleValueType();
40086
40087 unsigned SizeInBits = VT.getSizeInBits();
40088 unsigned NumMaskElts = Mask.size();
40089 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40090 unsigned NumOps = Ops.size();
40091
40092 // Extract constant bits from each source op.
40093 bool OneUseConstantOp = false;
40094 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40095 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40096 for (unsigned i = 0; i != NumOps; ++i) {
40097 SDValue SrcOp = Ops[i];
40098 OneUseConstantOp |= SrcOp.hasOneUse();
40099 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
40100 RawBitsOps[i]))
40101 return SDValue();
40102 }
40103
40104 // If we're optimizing for size, only fold if at least one of the constants
40105 // is only used once or the combined shuffle has included a variable mask
40106 // shuffle; this avoids constant pool bloat.
40107 bool IsOptimizingSize = DAG.shouldOptForSize();
40108 if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
40109 return SDValue();
40110
40111 // Shuffle the constant bits according to the mask.
40112 SDLoc DL(Root);
40113 APInt UndefElts(NumMaskElts, 0);
40114 APInt ZeroElts(NumMaskElts, 0);
40115 APInt ConstantElts(NumMaskElts, 0);
40116 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40117 APInt::getZero(MaskSizeInBits));
40118 for (unsigned i = 0; i != NumMaskElts; ++i) {
40119 int M = Mask[i];
40120 if (M == SM_SentinelUndef) {
40121 UndefElts.setBit(i);
40122 continue;
40123 } else if (M == SM_SentinelZero) {
40124 ZeroElts.setBit(i);
40125 continue;
40126 }
40127 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40128
40129 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40130 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40131
40132 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40133 if (SrcUndefElts[SrcMaskIdx]) {
40134 UndefElts.setBit(i);
40135 continue;
40136 }
40137
40138 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40139 APInt &Bits = SrcEltBits[SrcMaskIdx];
40140 if (!Bits) {
40141 ZeroElts.setBit(i);
40142 continue;
40143 }
40144
40145 ConstantElts.setBit(i);
40146 ConstantBitData[i] = Bits;
40147 }
40148 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40149
40150 // Attempt to create a zero vector.
40151 if ((UndefElts | ZeroElts).isAllOnes())
40152 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40153
40154 // Create the constant data.
40155 MVT MaskSVT;
40156 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40157 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40158 else
40159 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40160
40161 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40162 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40163 return SDValue();
40164
40165 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40166 return DAG.getBitcast(VT, CstOp);
40167}
40168
40169namespace llvm {
40170 namespace X86 {
40171 enum {
40172 MaxShuffleCombineDepth = 8
40173 };
40174 }
40175} // namespace llvm
40176
40177/// Fully generic combining of x86 shuffle instructions.
40178///
40179/// This should be the last combine run over the x86 shuffle instructions. Once
40180/// they have been fully optimized, this will recursively consider all chains
40181/// of single-use shuffle instructions, build a generic model of the cumulative
40182/// shuffle operation, and check for simpler instructions which implement this
40183/// operation. We use this primarily for two purposes:
40184///
40185/// 1) Collapse generic shuffles to specialized single instructions when
40186/// equivalent. In most cases, this is just an encoding size win, but
40187/// sometimes we will collapse multiple generic shuffles into a single
40188/// special-purpose shuffle.
40189/// 2) Look for sequences of shuffle instructions with 3 or more total
40190/// instructions, and replace them with the slightly more expensive SSSE3
40191/// PSHUFB instruction if available. We do this as the last combining step
40192/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40193/// a suitable short sequence of other instructions. The PSHUFB will either
40194/// use a register or have to read from memory and so is slightly (but only
40195/// slightly) more expensive than the other shuffle instructions.
40196///
40197/// Because this is inherently a quadratic operation (for each shuffle in
40198/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40199/// This should never be an issue in practice as the shuffle lowering doesn't
40200/// produce sequences of more than 8 instructions.
40201///
40202/// FIXME: We will currently miss some cases where the redundant shuffling
40203/// would simplify under the threshold for PSHUFB formation because of
40204/// combine-ordering. To fix this, we should do the redundant instruction
40205/// combining in this recursive walk.
40206static SDValue combineX86ShufflesRecursively(
40207 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40208 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40209 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40210 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40211 const X86Subtarget &Subtarget) {
40212 assert(RootMask.size() > 0 &&
40213 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40214 "Illegal shuffle root mask");
40215 MVT RootVT = Root.getSimpleValueType();
40216 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40217 unsigned RootSizeInBits = RootVT.getSizeInBits();
40218
40219 // Bound the depth of our recursive combine because this is ultimately
40220 // quadratic in nature.
40221 if (Depth >= MaxDepth)
40222 return SDValue();
40223
40224 // Directly rip through bitcasts to find the underlying operand.
40225 SDValue Op = SrcOps[SrcOpIndex];
40226 Op = peekThroughOneUseBitcasts(Op);
40227
40228 EVT VT = Op.getValueType();
40229 if (!VT.isVector() || !VT.isSimple())
40230 return SDValue(); // Bail if we hit a non-simple non-vector.
40231
40232 // FIXME: Just bail on f16 for now.
40233 if (VT.getVectorElementType() == MVT::f16)
40234 return SDValue();
40235
40236 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40237 "Can only combine shuffles upto size of the root op.");
40238
40239 // Create a demanded elts mask from the referenced elements of Op.
40240 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40241 for (int M : RootMask) {
40242 int BaseIdx = RootMask.size() * SrcOpIndex;
40243 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40244 OpDemandedElts.setBit(M - BaseIdx);
40245 }
40246 if (RootSizeInBits != VT.getSizeInBits()) {
40247 // Op is smaller than Root - extract the demanded elts for the subvector.
40248 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40249 unsigned NumOpMaskElts = RootMask.size() / Scale;
40250 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40251 assert(OpDemandedElts
40252 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40253 .isZero() &&
40254 "Out of range elements referenced in root mask");
40255 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40256 }
40257 OpDemandedElts =
40258 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
40259
40260 // Extract target shuffle mask and resolve sentinels and inputs.
40261 SmallVector<int, 64> OpMask;
40262 SmallVector<SDValue, 2> OpInputs;
40263 APInt OpUndef, OpZero;
40264 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40265 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40266 OpZero, DAG, Depth, false)) {
40267 // Shuffle inputs must not be larger than the shuffle result.
40268 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40269 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40270 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40271 }))
40272 return SDValue();
40273 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40274 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40275 !isNullConstant(Op.getOperand(1))) {
40276 SDValue SrcVec = Op.getOperand(0);
40277 int ExtractIdx = Op.getConstantOperandVal(1);
40278 unsigned NumElts = VT.getVectorNumElements();
40279 OpInputs.assign({SrcVec});
40280 OpMask.assign(NumElts, SM_SentinelUndef);
40281 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40282 OpZero = OpUndef = APInt::getZero(NumElts);
40283 } else {
40284 return SDValue();
40285 }
40286
40287 // If the shuffle result was smaller than the root, we need to adjust the
40288 // mask indices and pad the mask with undefs.
40289 if (RootSizeInBits > VT.getSizeInBits()) {
40290 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40291 unsigned OpMaskSize = OpMask.size();
40292 if (OpInputs.size() > 1) {
40293 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40294 for (int &M : OpMask) {
40295 if (M < 0)
40296 continue;
40297 int EltIdx = M % OpMaskSize;
40298 int OpIdx = M / OpMaskSize;
40299 M = (PaddedMaskSize * OpIdx) + EltIdx;
40300 }
40301 }
40302 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40303 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40304 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40305 }
40306
40307 SmallVector<int, 64> Mask;
40308 SmallVector<SDValue, 16> Ops;
40309
40310 // We don't need to merge masks if the root is empty.
40311 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40312 if (EmptyRoot) {
40313 // Only resolve zeros if it will remove an input, otherwise we might end
40314 // up in an infinite loop.
40315 bool ResolveKnownZeros = true;
40316 if (!OpZero.isZero()) {
40317 APInt UsedInputs = APInt::getZero(OpInputs.size());
40318 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40319 int M = OpMask[i];
40320 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40321 continue;
40322 UsedInputs.setBit(M / OpMask.size());
40323 if (UsedInputs.isAllOnes()) {
40324 ResolveKnownZeros = false;
40325 break;
40326 }
40327 }
40328 }
40329 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40330 ResolveKnownZeros);
40331
40332 Mask = OpMask;
40333 Ops.append(OpInputs.begin(), OpInputs.end());
40334 } else {
40335 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40336
40337 // Add the inputs to the Ops list, avoiding duplicates.
40338 Ops.append(SrcOps.begin(), SrcOps.end());
40339
40340 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40341 // Attempt to find an existing match.
40342 SDValue InputBC = peekThroughBitcasts(Input);
40343 for (int i = 0, e = Ops.size(); i < e; ++i)
40344 if (InputBC == peekThroughBitcasts(Ops[i]))
40345 return i;
40346 // Match failed - should we replace an existing Op?
40347 if (InsertionPoint >= 0) {
40348 Ops[InsertionPoint] = Input;
40349 return InsertionPoint;
40350 }
40351 // Add to the end of the Ops list.
40352 Ops.push_back(Input);
40353 return Ops.size() - 1;
40354 };
40355
40356 SmallVector<int, 2> OpInputIdx;
40357 for (SDValue OpInput : OpInputs)
40358 OpInputIdx.push_back(
40359 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40360
40361 assert(((RootMask.size() > OpMask.size() &&
40362 RootMask.size() % OpMask.size() == 0) ||
40363 (OpMask.size() > RootMask.size() &&
40364 OpMask.size() % RootMask.size() == 0) ||
40365 OpMask.size() == RootMask.size()) &&
40366 "The smaller number of elements must divide the larger.");
40367
40368 // This function can be performance-critical, so we rely on the power-of-2
40369 // knowledge that we have about the mask sizes to replace div/rem ops with
40370 // bit-masks and shifts.
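// [Editorial illustration - not part of X86ISelLowering.cpp] The power-of-2
// equivalences relied upon below, shown for a hypothetical mask size of 8
// (log2 = 3):
static_assert((13 / 8) == (13 >> 3) && (13 % 8) == (13 & 7),
              "x / 8 == x >> 3 and x % 8 == x & 7 for non-negative x");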
40371 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40372 "Non-power-of-2 shuffle mask sizes");
40373 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40374 "Non-power-of-2 shuffle mask sizes");
40375 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40376 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40377
40378 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40379 unsigned RootRatio =
40380 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40381 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40382 assert((RootRatio == 1 || OpRatio == 1) &&
40383 "Must not have a ratio for both incoming and op masks!");
40384
40385 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40386 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40387 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40388 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40389 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40390
40391 Mask.resize(MaskWidth, SM_SentinelUndef);
40392
40393 // Merge this shuffle operation's mask into our accumulated mask. Note that
40394 // this shuffle's mask will be the first applied to the input, followed by
40395 // the root mask to get us all the way to the root value arrangement. The
40396 // reason for this order is that we are recursing up the operation chain.
40397 for (unsigned i = 0; i < MaskWidth; ++i) {
40398 unsigned RootIdx = i >> RootRatioLog2;
40399 if (RootMask[RootIdx] < 0) {
40400 // This is a zero or undef lane; we're done.
40401 Mask[i] = RootMask[RootIdx];
40402 continue;
40403 }
40404
40405 unsigned RootMaskedIdx =
40406 RootRatio == 1
40407 ? RootMask[RootIdx]
40408 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40409
40410 // Just insert the scaled root mask value if it references an input other
40411 // than the SrcOp we're currently inserting.
40412 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40413 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40414 Mask[i] = RootMaskedIdx;
40415 continue;
40416 }
40417
40418 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40419 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40420 if (OpMask[OpIdx] < 0) {
40421 // The incoming lanes are zero or undef; it doesn't matter which ones we
40422 // are using.
40423 Mask[i] = OpMask[OpIdx];
40424 continue;
40425 }
40426
40427 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40428 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40429 : (OpMask[OpIdx] << OpRatioLog2) +
40430 (RootMaskedIdx & (OpRatio - 1));
40431
40432 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40433 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40434 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40435 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40436
40437 Mask[i] = OpMaskedIdx;
40438 }
40439 }
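// [Editorial illustration - not part of X86ISelLowering.cpp] A minimal example
// of the merge loop above with equally sized masks (RootRatio == OpRatio == 1)
// and a single input: RootMask = {2, 3, 0, 1}, OpMask = {1, 0, 3, 2}. The op's
// mask is applied first, then the root mask, so Mask[i] = OpMask[RootMask[i]]:
//   Mask = {OpMask[2], OpMask[3], OpMask[0], OpMask[1]} = {3, 2, 1, 0}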
40440
40441 // Peek through vector widenings and set out of bounds mask indices to undef.
40442 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
40443 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
40444 SDValue &Op = Ops[I];
40445 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
40446 isNullConstant(Op.getOperand(2))) {
40447 Op = Op.getOperand(1);
40448 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
40449 int Lo = I * Mask.size();
40450 int Hi = (I + 1) * Mask.size();
40451 int NewHi = Lo + (Mask.size() / Scale);
40452 for (int &M : Mask) {
40453 if (Lo <= M && NewHi <= M && M < Hi)
40454 M = SM_SentinelUndef;
40455 }
40456 }
40457 }
40458
40459 // Peek through any free extract_subvector nodes back to root size.
40460 for (SDValue &Op : Ops)
40461 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40462 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40463 isNullConstant(Op.getOperand(1)))
40464 Op = Op.getOperand(0);
40465
40466 // Remove unused/repeated shuffle source ops.
40467 resolveTargetShuffleInputsAndMask(Ops, Mask);
40468
40469 // Handle the all undef/zero/ones cases early.
40470 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
40471 return DAG.getUNDEF(RootVT);
40472 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
40473 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
40474 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
40475 !llvm::is_contained(Mask, SM_SentinelZero))
40476 return getOnesVector(RootVT, DAG, SDLoc(Root));
40477
40478 assert(!Ops.empty() && "Shuffle with no inputs detected");
40479 HasVariableMask |= IsOpVariableMask;
40480
40481 // Update the list of shuffle nodes that have been combined so far.
40482 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
40483 SrcNodes.end());
40484 CombinedNodes.push_back(Op.getNode());
40485
40486 // See if we can recurse into each shuffle source op (if it's a target
40487 // shuffle). The source op should only be generally combined if it either has
40488 // a single use (i.e. the current Op) or all its users have already been
40489 // combined; if not, we can still combine but should prevent generating
40490 // variable shuffles, to avoid constant pool bloat.
40491 // Don't recurse if we already have more source ops than we can combine in
40492 // the remaining recursion depth.
40493 if (Ops.size() < (MaxDepth - Depth)) {
40494 for (int i = 0, e = Ops.size(); i < e; ++i) {
40495 // For empty roots, we need to resolve zeroable elements before combining
40496 // them with other shuffles.
40497 SmallVector<int, 64> ResolvedMask = Mask;
40498 if (EmptyRoot)
40499 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
40500 bool AllowCrossLaneVar = false;
40501 bool AllowPerLaneVar = false;
40502 if (Ops[i].getNode()->hasOneUse() ||
40503 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
40504 AllowCrossLaneVar = AllowVariableCrossLaneMask;
40505 AllowPerLaneVar = AllowVariablePerLaneMask;
40506 }
40507 if (SDValue Res = combineX86ShufflesRecursively(
40508 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
40509 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
40510 Subtarget))
40511 return Res;
40512 }
40513 }
40514
40515 // Attempt to constant fold all of the constant source ops.
40516 if (SDValue Cst = combineX86ShufflesConstants(
40517 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
40518 return Cst;
40519
40520 // If constant folding failed and we only have constants, then we have
40521 // multiple uses by a single non-variable shuffle - just bail.
40522 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
40523 APInt UndefElts;
40524 SmallVector<APInt> RawBits;
40525 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40526 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40527 RawBits);
40528 })) {
40529 return SDValue();
40530 }
40531
40532 // Canonicalize the combined shuffle mask chain with horizontal ops.
40533 // NOTE: This will update the Ops and Mask.
40534 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
40535 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
40536 return DAG.getBitcast(RootVT, HOp);
40537
40538 // Try to refine our inputs given our knowledge of target shuffle mask.
40539 for (auto I : enumerate(Ops)) {
40540 int OpIdx = I.index();
40541 SDValue &Op = I.value();
40542
40543 // What range of shuffle mask element values results in picking from Op?
40544 int Lo = OpIdx * Mask.size();
40545 int Hi = Lo + Mask.size();
40546
40547 // Which elements of Op do we demand, given the mask's granularity?
40548 APInt OpDemandedElts(Mask.size(), 0);
40549 for (int MaskElt : Mask) {
40550 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
40551 int OpEltIdx = MaskElt - Lo;
40552 OpDemandedElts.setBit(OpEltIdx);
40553 }
40554 }
40555
40556 // Is the shuffle result smaller than the root?
40557 if (Op.getValueSizeInBits() < RootSizeInBits) {
40558 // We padded the mask with undefs. But we now need to undo that.
40559 unsigned NumExpectedVectorElts = Mask.size();
40560 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
40561 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
40562 assert(!OpDemandedElts.extractBits(
40563 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
40564 "Demanding the virtual undef widening padding?");
40565 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
40566 }
40567
40568 // The Op itself may be of different VT, so we need to scale the mask.
40569 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
40570 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
40571
40572 // Can this operand be simplified any further, given it's demanded elements?
40573 if (SDValue NewOp =
40574 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
40575 Op, OpScaledDemandedElts, DAG))
40576 Op = NewOp;
40577 }
40578 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
40579
40580 // Widen any subvector shuffle inputs we've collected.
40581 // TODO: Remove this to avoid generating temporary nodes, we should only
40582 // widen once combineX86ShuffleChain has found a match.
40583 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
40584 return Op.getValueSizeInBits() < RootSizeInBits;
40585 })) {
40586 for (SDValue &Op : Ops)
40587 if (Op.getValueSizeInBits() < RootSizeInBits)
40588 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
40589 RootSizeInBits);
40590 // Reresolve - we might have repeated subvector sources.
40591 resolveTargetShuffleInputsAndMask(Ops, Mask);
40592 }
40593
40594 // We can only combine unary and binary shuffle mask cases.
40595 if (Ops.size() <= 2) {
40596 // Minor canonicalization of the accumulated shuffle mask to make it easier
40597 // to match below. All this does is detect masks with sequential pairs of
40598 // elements, and shrink them to the half-width mask. It does this in a loop
40599 // so it will reduce the size of the mask to the minimal width mask which
40600 // performs an equivalent shuffle.
40601 while (Mask.size() > 1) {
40602 SmallVector<int, 64> WidenedMask;
40603 if (!canWidenShuffleElements(Mask, WidenedMask))
40604 break;
40605 Mask = std::move(WidenedMask);
40606 }
40607
40608 // Canonicalization of binary shuffle masks to improve pattern matching by
40609 // commuting the inputs.
40610 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
40611 ShuffleVectorSDNode::commuteMask(Mask);
40612 std::swap(Ops[0], Ops[1]);
40613 }
40614
40615 // Try to combine into a single shuffle instruction.
40616 if (SDValue Shuffle = combineX86ShuffleChain(
40617 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40618 AllowVariablePerLaneMask, DAG, Subtarget))
40619 return Shuffle;
40620
40621 // If all the operands come from the same larger vector, fall through and try
40622 // to use combineX86ShuffleChainWithExtract.
40623 SDValue LHS = peekThroughBitcasts(Ops.front());
40624 SDValue RHS = peekThroughBitcasts(Ops.back());
40625 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
40626 (RootSizeInBits / Mask.size()) != 64 ||
40627 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40628 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40629 LHS.getOperand(0) != RHS.getOperand(0))
40630 return SDValue();
40631 }
40632
40633 // If that failed and any input is extracted then try to combine as a
40634 // shuffle with the larger type.
40635 return combineX86ShuffleChainWithExtract(
40636 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40637 AllowVariablePerLaneMask, DAG, Subtarget);
40638}
40639
40640/// Helper entry wrapper to combineX86ShufflesRecursively.
40641static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
40642 const X86Subtarget &Subtarget) {
40643 return combineX86ShufflesRecursively(
40644 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
40645 /*HasVarMask*/ false,
40646 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
40647 Subtarget);
40648}
40649
40650/// Get the PSHUF-style mask from PSHUF node.
40651///
40652/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
40653/// PSHUF-style masks that can be reused with such instructions.
40654static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
40655 MVT VT = N.getSimpleValueType();
40656 SmallVector<int, 4> Mask;
40657 SmallVector<SDValue, 2> Ops;
40658 bool HaveMask =
40659 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
40660 (void)HaveMask;
40661 assert(HaveMask);
40662
40663 // If we have more than 128-bits, only the low 128-bits of shuffle mask
40664 // matter. Check that the upper masks are repeats and remove them.
40665 if (VT.getSizeInBits() > 128) {
40666 int LaneElts = 128 / VT.getScalarSizeInBits();
40667#ifndef NDEBUG
40668 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
40669 for (int j = 0; j < LaneElts; ++j)
40670 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
40671 "Mask doesn't repeat in high 128-bit lanes!");
40672#endif
40673 Mask.resize(LaneElts);
40674 }
40675
40676 switch (N.getOpcode()) {
40677 case X86ISD::PSHUFD:
40678 return Mask;
40679 case X86ISD::PSHUFLW:
40680 Mask.resize(4);
40681 return Mask;
40682 case X86ISD::PSHUFHW:
40683 Mask.erase(Mask.begin(), Mask.begin() + 4);
40684 for (int &M : Mask)
40685 M -= 4;
40686 return Mask;
40687 default:
40688 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 40688)
;
40689 }
40690}
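// [Editorial illustration - not part of X86ISelLowering.cpp] Example of the
// extraction above for a hypothetical v8i16 PSHUFHW node whose full mask is
// {0, 1, 2, 3, 6, 5, 4, 7}: the low half is dropped and 4 is subtracted from
// the remaining entries, giving the v4-style mask {2, 1, 0, 3}.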
40691
40692/// Search for a combinable shuffle across a chain ending in pshufd.
40693///
40694/// We walk up the chain and look for a combinable shuffle, skipping over
40695/// shuffles that we could hoist this shuffle's transformation past without
40696/// altering anything.
40697static SDValue
40698combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
40699 SelectionDAG &DAG) {
40700 assert(N.getOpcode() == X86ISD::PSHUFD &&
40701 "Called with something other than an x86 128-bit half shuffle!");
40702 SDLoc DL(N);
40703
40704 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
40705 // of the shuffles in the chain so that we can form a fresh chain to replace
40706 // this one.
40707 SmallVector<SDValue, 8> Chain;
40708 SDValue V = N.getOperand(0);
40709 for (; V.hasOneUse(); V = V.getOperand(0)) {
40710 switch (V.getOpcode()) {
40711 default:
40712 return SDValue(); // Nothing combined!
40713
40714 case ISD::BITCAST:
40715 // Skip bitcasts as we always know the type for the target specific
40716 // instructions.
40717 continue;
40718
40719 case X86ISD::PSHUFD:
40720 // Found another dword shuffle.
40721 break;
40722
40723 case X86ISD::PSHUFLW:
40724 // Check that the low words (being shuffled) are the identity in the
40725 // dword shuffle, and the high words are self-contained.
40726 if (Mask[0] != 0 || Mask[1] != 1 ||
40727 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
40728 return SDValue();
40729
40730 Chain.push_back(V);
40731 continue;
40732
40733 case X86ISD::PSHUFHW:
40734 // Check that the high words (being shuffled) are the identity in the
40735 // dword shuffle, and the low words are self-contained.
40736 if (Mask[2] != 2 || Mask[3] != 3 ||
40737 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
40738 return SDValue();
40739
40740 Chain.push_back(V);
40741 continue;
40742
40743 case X86ISD::UNPCKL:
40744 case X86ISD::UNPCKH:
40745 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
40746 // shuffle into a preceding word shuffle.
40747 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
40748 V.getSimpleValueType().getVectorElementType() != MVT::i16)
40749 return SDValue();
40750
40751 // Search for a half-shuffle which we can combine with.
40752 unsigned CombineOp =
40753 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
40754 if (V.getOperand(0) != V.getOperand(1) ||
40755 !V->isOnlyUserOf(V.getOperand(0).getNode()))
40756 return SDValue();
40757 Chain.push_back(V);
40758 V = V.getOperand(0);
40759 do {
40760 switch (V.getOpcode()) {
40761 default:
40762 return SDValue(); // Nothing to combine.
40763
40764 case X86ISD::PSHUFLW:
40765 case X86ISD::PSHUFHW:
40766 if (V.getOpcode() == CombineOp)
40767 break;
40768
40769 Chain.push_back(V);
40770
40771 [[fallthrough]];
40772 case ISD::BITCAST:
40773 V = V.getOperand(0);
40774 continue;
40775 }
40776 break;
40777 } while (V.hasOneUse());
40778 break;
40779 }
40780 // Break out of the loop if we break out of the switch.
40781 break;
40782 }
40783
40784 if (!V.hasOneUse())
40785 // We fell out of the loop without finding a viable combining instruction.
40786 return SDValue();
40787
40788 // Merge this node's mask and our incoming mask.
40789 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40790 for (int &M : Mask)
40791 M = VMask[M];
40792 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
40793 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40794
40795 // Rebuild the chain around this new shuffle.
40796 while (!Chain.empty()) {
40797 SDValue W = Chain.pop_back_val();
40798
40799 if (V.getValueType() != W.getOperand(0).getValueType())
40800 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
40801
40802 switch (W.getOpcode()) {
40803 default:
40804 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 40804)
;
40805
40806 case X86ISD::UNPCKL:
40807 case X86ISD::UNPCKH:
40808 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
40809 break;
40810
40811 case X86ISD::PSHUFD:
40812 case X86ISD::PSHUFLW:
40813 case X86ISD::PSHUFHW:
40814 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
40815 break;
40816 }
40817 }
40818 if (V.getValueType() != N.getValueType())
40819 V = DAG.getBitcast(N.getValueType(), V);
40820
40821 // Return the new chain to replace N.
40822 return V;
40823}
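
The merge step above, M = VMask[M], is ordinary permutation-mask composition: if V = PSHUFD(X, VMask) and N = PSHUFD(V, Mask), then element i of N is X[VMask[Mask[i]]], so the two shuffles collapse into one. A minimal standalone sketch of that identity (not LLVM code; pshufd and composeMask are illustrative names):

// Standalone illustration of the "M = VMask[M]" mask merge above.
// Composing two 4-element dword shuffles: pshufd(pshufd(X, VMask), Mask)
// equals a single pshufd(X, Composed) with Composed[i] = VMask[Mask[i]].
#include <array>
#include <cassert>

using Mask4 = std::array<int, 4>;
using Vec4  = std::array<int, 4>;

static Vec4 pshufd(const Vec4 &V, const Mask4 &M) {
  return {V[M[0]], V[M[1]], V[M[2]], V[M[3]]};
}

static Mask4 composeMask(const Mask4 &VMask, Mask4 Mask) {
  for (int &M : Mask)
    M = VMask[M];            // same update as in the combine above
  return Mask;
}

int main() {
  Vec4 X = {10, 20, 30, 40};
  Mask4 VMask = {2, 3, 0, 1}; // inner PSHUFD
  Mask4 Mask  = {1, 0, 3, 2}; // outer PSHUFD
  assert(pshufd(pshufd(X, VMask), Mask) == pshufd(X, composeMask(VMask, Mask)));
  return 0;
}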
40824
40825// Attempt to commute shufps LHS loads:
40826// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
40827static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
40828 SelectionDAG &DAG) {
40829 // TODO: Add vXf64 support.
40830 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
40831 return SDValue();
40832
40833 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
40834 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
40835 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
40836 return SDValue();
40837 SDValue N0 = V.getOperand(0);
40838 SDValue N1 = V.getOperand(1);
40839 unsigned Imm = V.getConstantOperandVal(2);
40840 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
40841 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
40842 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
40843 return SDValue();
40844 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
40845 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
40846 DAG.getTargetConstant(Imm, DL, MVT::i8));
40847 };
40848
40849 switch (N.getOpcode()) {
40850 case X86ISD::VPERMILPI:
40851 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
40852 unsigned Imm = N.getConstantOperandVal(1);
40853 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
40854 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40855 }
40856 break;
40857 case X86ISD::SHUFP: {
40858 SDValue N0 = N.getOperand(0);
40859 SDValue N1 = N.getOperand(1);
40860 unsigned Imm = N.getConstantOperandVal(2);
40861 if (N0 == N1) {
40862 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40863 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40864 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40865 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40866 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40867 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40868 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40869 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40870 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40871 }
40872 break;
40873 }
40874 }
40875
40876 return SDValue();
40877}
40878
40879// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40880static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
40881 const SDLoc &DL) {
40882 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40883 EVT ShuffleVT = N.getValueType();
40884
40885 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
40886 // AllZeros/AllOnes constants are freely shuffled and will peek through
40887 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40888 // merge with target shuffles if it has one use so shuffle combining is
40889 // likely to kick in. Shuffles of splats are expected to be removed.
40890 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40891 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40892 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40893 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40894 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40895 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40896 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40897 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40898 };
40899 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40900 // Ensure we only shuffle whole vector src elements, unless it's a logical
40901 // binop where we can more aggressively move shuffles from dst to src.
40902 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
40903 BinOp == X86ISD::ANDNP ||
40904 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40905 };
40906
40907 unsigned Opc = N.getOpcode();
40908 switch (Opc) {
40909 // Unary and Unary+Permute Shuffles.
40910 case X86ISD::PSHUFB: {
40911 // Don't merge PSHUFB if it contains zero'd elements.
40912 SmallVector<int> Mask;
40913 SmallVector<SDValue> Ops;
40914 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
40915 Mask))
40916 break;
40917 [[fallthrough]];
40918 }
40919 case X86ISD::VBROADCAST:
40920 case X86ISD::MOVDDUP:
40921 case X86ISD::PSHUFD:
40922 case X86ISD::PSHUFHW:
40923 case X86ISD::PSHUFLW:
40924 case X86ISD::VPERMI:
40925 case X86ISD::VPERMILPI: {
40926 if (N.getOperand(0).getValueType() == ShuffleVT &&
40927 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40928 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40929 unsigned SrcOpcode = N0.getOpcode();
40930 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40931 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40932 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40933 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
40934 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
40935 SDValue LHS, RHS;
40936 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40937 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40938 if (N.getNumOperands() == 2) {
40939 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40940 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40941 } else {
40942 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40943 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40944 }
40945 EVT OpVT = N0.getValueType();
40946 return DAG.getBitcast(ShuffleVT,
40947 DAG.getNode(SrcOpcode, DL, OpVT,
40948 DAG.getBitcast(OpVT, LHS),
40949 DAG.getBitcast(OpVT, RHS)));
40950 }
40951 }
40952 }
40953 break;
40954 }
40955 // Binary and Binary+Permute Shuffles.
40956 case X86ISD::INSERTPS: {
40957 // Don't merge INSERTPS if it contains zero'd elements.
40958 unsigned InsertPSMask = N.getConstantOperandVal(2);
40959 unsigned ZeroMask = InsertPSMask & 0xF;
40960 if (ZeroMask != 0)
40961 break;
40962 [[fallthrough]];
40963 }
40964 case X86ISD::MOVSD:
40965 case X86ISD::MOVSS:
40966 case X86ISD::BLENDI:
40967 case X86ISD::SHUFP:
40968 case X86ISD::UNPCKH:
40969 case X86ISD::UNPCKL: {
40970 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40971 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40972 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40973 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40974 unsigned SrcOpcode = N0.getOpcode();
40975 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40976 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40977 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40978 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40979 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40980 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40981 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40982 // Ensure the total number of shuffles doesn't increase by folding this
40983 // shuffle through to the source ops.
40984 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40985 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40986 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40987 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40988 SDValue LHS, RHS;
40989 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40990 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40991 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40992 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40993 if (N.getNumOperands() == 3) {
40994 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40995 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40996 } else {
40997 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40998 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40999 }
41000 EVT OpVT = N0.getValueType();
41001 return DAG.getBitcast(ShuffleVT,
41002 DAG.getNode(SrcOpcode, DL, OpVT,
41003 DAG.getBitcast(OpVT, LHS),
41004 DAG.getBitcast(OpVT, RHS)));
41005 }
41006 }
41007 }
41008 break;
41009 }
41010 }
41011 return SDValue();
41012}
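
The canonicalization relies on a simple fact: a lane permutation distributes over any element-wise binary op, so SHUFFLE(BINOP(X, Y)) and BINOP(SHUFFLE(X), SHUFFLE(Y)) are always equal; the IsMergeableWithShuffle checks above only decide when the rewrite is profitable. A minimal standalone sketch of the identity (not LLVM code; integer add stands in for the binop):

// Standalone illustration of SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)):
// an element-wise op and a lane permutation commute, which is what lets the
// combine above push the shuffle through to the binop's operands.
#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

static V4 shuffle(const V4 &V, const std::array<int, 4> &M) {
  return {V[M[0]], V[M[1]], V[M[2]], V[M[3]]};
}

static V4 add(const V4 &A, const V4 &B) {
  return {A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]};
}

int main() {
  V4 X = {1, 2, 3, 4}, Y = {10, 20, 30, 40};
  std::array<int, 4> M = {3, 1, 2, 0};
  assert(shuffle(add(X, Y), M) == add(shuffle(X, M), shuffle(Y, M)));
  return 0;
}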
41013
41014/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41015static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41016 SelectionDAG &DAG,
41017 const SDLoc &DL) {
41018 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41019
41020 MVT VT = V.getSimpleValueType();
41021 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41022 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41023 unsigned SrcOpc0 = Src0.getOpcode();
41024 unsigned SrcOpc1 = Src1.getOpcode();
41025 EVT SrcVT0 = Src0.getValueType();
41026 EVT SrcVT1 = Src1.getValueType();
41027
41028 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41029 return SDValue();
41030
41031 switch (SrcOpc0) {
41032 case X86ISD::MOVDDUP: {
41033 SDValue LHS = Src0.getOperand(0);
41034 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41035 SDValue Res =
41036 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41037 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41038 return DAG.getBitcast(VT, Res);
41039 }
41040 case X86ISD::VPERMILPI:
41041 // TODO: Handle v4f64 permutes with different low/high lane masks.
41042 if (SrcVT0 == MVT::v4f64) {
41043 uint64_t Mask = Src0.getConstantOperandVal(1);
41044 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41045 break;
41046 }
41047 [[fallthrough]];
41048 case X86ISD::VSHLI:
41049 case X86ISD::VSRLI:
41050 case X86ISD::VSRAI:
41051 case X86ISD::PSHUFD:
41052 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41053 SDValue LHS = Src0.getOperand(0);
41054 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41055 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41056 V.getOperand(2));
41057 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41058 return DAG.getBitcast(VT, Res);
41059 }
41060 break;
41061 }
41062
41063 return SDValue();
41064}
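
VPERM2X128 only moves whole 128-bit lanes, so it commutes with any source op that treats each lane independently and identically, which is what the cases above (MOVDDUP, the per-lane shifts, PSHUFD, VPERMILPI with a repeated lane mask) have in common. A minimal standalone sketch with a two-lane toy vector (not LLVM code; perm2x128 ignores the immediate's zeroing bits, and an element-wise +1 stands in for the per-lane op):

// Standalone illustration of vperm2x128(op(X), op(Y)) -> op(vperm2x128(X, Y)).
// Model a 256-bit vector as two 128-bit "lanes"; a lane select commutes with
// any op that acts on each lane independently (here: add 1 to every element).
#include <array>
#include <cassert>

using Lane = std::array<int, 4>;
using V256 = std::array<Lane, 2>;

// Pick the result lanes from {X.lo, X.hi, Y.lo, Y.hi} the way VPERM2X128's
// immediate does (zeroing bits ignored in this sketch).
static V256 perm2x128(const V256 &X, const V256 &Y, unsigned Imm) {
  const Lane Src[4] = {X[0], X[1], Y[0], Y[1]};
  return {Src[Imm & 3], Src[(Imm >> 4) & 3]};
}

static V256 perLaneOp(V256 V) {          // stand-in for MOVDDUP/PSHUFD/etc.
  for (Lane &L : V)
    for (int &E : L)
      E += 1;
  return V;
}

int main() {
  V256 X = {{{1, 2, 3, 4}, {5, 6, 7, 8}}};
  V256 Y = {{{9, 10, 11, 12}, {13, 14, 15, 16}}};
  for (unsigned Imm : {0x20u, 0x31u, 0x13u, 0x02u})
    assert(perm2x128(perLaneOp(X), perLaneOp(Y), Imm) ==
           perLaneOp(perm2x128(X, Y, Imm)));
  return 0;
}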
41065
41066/// Try to combine x86 target specific shuffles.
41067static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41068 TargetLowering::DAGCombinerInfo &DCI,
41069 const X86Subtarget &Subtarget) {
41070 SDLoc DL(N);
41071 MVT VT = N.getSimpleValueType();
41072 SmallVector<int, 4> Mask;
41073 unsigned Opcode = N.getOpcode();
41074
41075 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41076 return R;
41077
41078 // Handle specific target shuffles.
41079 switch (Opcode) {
41080 case X86ISD::MOVDDUP: {
41081 SDValue Src = N.getOperand(0);
41082 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41083 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41084 ISD::isNormalLoad(Src.getNode())) {
41085 LoadSDNode *LN = cast<LoadSDNode>(Src);
41086 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41087 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41088 DCI.CombineTo(N.getNode(), Movddup);
41089 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41090 DCI.recursivelyDeleteUnusedNodes(LN);
41091 return N; // Return N so it doesn't get rechecked!
41092 }
41093 }
41094
41095 return SDValue();
41096 }
41097 case X86ISD::VBROADCAST: {
41098 SDValue Src = N.getOperand(0);
41099 SDValue BC = peekThroughBitcasts(Src);
41100 EVT SrcVT = Src.getValueType();
41101 EVT BCVT = BC.getValueType();
41102
41103 // If broadcasting from another shuffle, attempt to simplify it.
41104 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41105 if (isTargetShuffle(BC.getOpcode()) &&
41106 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41107 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41108 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41109 SM_SentinelUndef);
41110 for (unsigned i = 0; i != Scale; ++i)
41111 DemandedMask[i] = i;
41112 if (SDValue Res = combineX86ShufflesRecursively(
41113 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41114 X86::MaxShuffleCombineDepth,
41115 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41116 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41117 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41118 DAG.getBitcast(SrcVT, Res));
41119 }
41120
41121 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41122 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41123 if (Src.getOpcode() == ISD::BITCAST &&
41124 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41125 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41126 FixedVectorType::isValidElementType(
41127 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41128 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41129 VT.getVectorNumElements());
41130 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41131 }
41132
41133 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41134 // If we're re-broadcasting a smaller type then broadcast with that type and
41135 // bitcast.
41136 // TODO: Do this for any splat?
41137 if (Src.getOpcode() == ISD::BITCAST &&
41138 (BC.getOpcode() == X86ISD::VBROADCAST ||
41139 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41140 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41141 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41142 MVT NewVT =
41143 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41144 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41145 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41146 }
41147
41148 // Reduce broadcast source vector to lowest 128-bits.
41149 if (SrcVT.getSizeInBits() > 128)
41150 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41151 extract128BitVector(Src, 0, DAG, DL));
41152
41153 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41154 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41155 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41156
41157 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41158 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41159 isNullConstant(Src.getOperand(1)) &&
41160 DAG.getTargetLoweringInfo().isTypeLegal(
41161 Src.getOperand(0).getValueType()))
41162 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41163
41164 // Share broadcast with the longest vector and extract low subvector (free).
41165 // Ensure the user is using the same SDValue result, not just the same SDNode.
41166 for (SDNode *User : Src->uses())
41167 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41168 Src == User->getOperand(0) &&
41169 User->getValueSizeInBits(0).getFixedValue() >
41170 VT.getFixedSizeInBits()) {
41171 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41172 VT.getSizeInBits());
41173 }
41174
41175 // vbroadcast(scalarload X) -> vbroadcast_load X
41176 // For float loads, extract other uses of the scalar from the broadcast.
41177 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41178 ISD::isNormalLoad(Src.getNode())) {
41179 LoadSDNode *LN = cast<LoadSDNode>(Src);
41180 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41181 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41182 SDValue BcastLd =
41183 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41184 LN->getMemoryVT(), LN->getMemOperand());
41185 // If the load value is used only by N, replace it via CombineTo N.
41186 bool NoReplaceExtract = Src.hasOneUse();
41187 DCI.CombineTo(N.getNode(), BcastLd);
41188 if (NoReplaceExtract) {
41189 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41190 DCI.recursivelyDeleteUnusedNodes(LN);
41191 } else {
41192 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41193 DAG.getIntPtrConstant(0, DL));
41194 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41195 }
41196 return N; // Return N so it doesn't get rechecked!
41197 }
41198
41199 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41200 // i16. So shrink it ourselves if we can make a broadcast_load.
41201 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41202 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41203 assert(Subtarget.hasAVX2() && "Expected AVX2");
41204 SDValue TruncIn = Src.getOperand(0);
41205
41206 // If this is a truncate of a non-extending load, we can just narrow it to
41207 // use a broadcast_load.
41208 if (ISD::isNormalLoad(TruncIn.getNode())) {
41209 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41210 // Unless it's volatile or atomic.
41211 if (LN->isSimple()) {
41212 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41213 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41214 SDValue BcastLd = DAG.getMemIntrinsicNode(
41215 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41216 LN->getPointerInfo(), LN->getOriginalAlign(),
41217 LN->getMemOperand()->getFlags());
41218 DCI.CombineTo(N.getNode(), BcastLd);
41219 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41220 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41221 return N; // Return N so it doesn't get rechecked!
41222 }
41223 }
41224
41225 // If this is a truncate of an i16 extload, we can directly replace it.
41226 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41227 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41228 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41229 if (LN->getMemoryVT().getSizeInBits() == 16) {
41230 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41231 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41232 SDValue BcastLd =
41233 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41234 LN->getMemoryVT(), LN->getMemOperand());
41235 DCI.CombineTo(N.getNode(), BcastLd);
41236 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41237 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41238 return N; // Return N so it doesn't get rechecked!
41239 }
41240 }
41241
41242 // If this is a truncate of a load that has been shifted right, we can
41243 // offset the pointer and use a narrower load.
41244 if (TruncIn.getOpcode() == ISD::SRL &&
41245 TruncIn.getOperand(0).hasOneUse() &&
41246 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41247 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41248 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41249 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41250 // Make sure the shift amount and the load size are divisible by 16.
41251 // Don't do this if the load is volatile or atomic.
41252 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41253 LN->isSimple()) {
41254 unsigned Offset = ShiftAmt / 8;
41255 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41256 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
41257 TypeSize::Fixed(Offset), DL);
41258 SDValue Ops[] = { LN->getChain(), Ptr };
41259 SDValue BcastLd = DAG.getMemIntrinsicNode(
41260 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41261 LN->getPointerInfo().getWithOffset(Offset),
41262 LN->getOriginalAlign(),
41263 LN->getMemOperand()->getFlags());
41264 DCI.CombineTo(N.getNode(), BcastLd);
41265 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41266 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41267 return N; // Return N so it doesn't get rechecked!
41268 }
41269 }
41270 }
41271
41272 // vbroadcast(vzload X) -> vbroadcast_load X
41273 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41274 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41275 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41276 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41277 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41278 SDValue BcastLd =
41279 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41280 LN->getMemoryVT(), LN->getMemOperand());
41281 DCI.CombineTo(N.getNode(), BcastLd);
41282 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41283 DCI.recursivelyDeleteUnusedNodes(LN);
41284 return N; // Return N so it doesn't get rechecked!
41285 }
41286 }
41287
41288 // vbroadcast(vector load X) -> vbroadcast_load
41289 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41290 SrcVT == MVT::v4i32) &&
41291 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41292 LoadSDNode *LN = cast<LoadSDNode>(Src);
41293 // Unless the load is volatile or atomic.
41294 if (LN->isSimple()) {
41295 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41296 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41297 SDValue BcastLd = DAG.getMemIntrinsicNode(
41298 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
41299 LN->getPointerInfo(), LN->getOriginalAlign(),
41300 LN->getMemOperand()->getFlags());
41301 DCI.CombineTo(N.getNode(), BcastLd);
41302 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41303 DCI.recursivelyDeleteUnusedNodes(LN);
41304 return N; // Return N so it doesn't get rechecked!
41305 }
41306 }
41307
41308 return SDValue();
41309 }
41310 case X86ISD::VZEXT_MOVL: {
41311 SDValue N0 = N.getOperand(0);
41312
41313 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
41314 // the load is volatile.
41315 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
41316 auto *LN = cast<LoadSDNode>(N0);
41317 if (SDValue VZLoad =
41318 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
41319 DCI.CombineTo(N.getNode(), VZLoad);
41320 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41321 DCI.recursivelyDeleteUnusedNodes(LN);
41322 return N;
41323 }
41324 }
41325
41326 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
41327 // and can just use a VZEXT_LOAD.
41328 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
41329 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
41330 auto *LN = cast<MemSDNode>(N0);
41331 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
41332 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41333 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41334 SDValue VZLoad =
41335 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
41336 LN->getMemoryVT(), LN->getMemOperand());
41337 DCI.CombineTo(N.getNode(), VZLoad);
41338 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41339 DCI.recursivelyDeleteUnusedNodes(LN);
41340 return N;
41341 }
41342 }
41343
41344 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
41345 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
41346 // if the upper bits of the i64 are zero.
41347 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41348 N0.getOperand(0).hasOneUse() &&
41349 N0.getOperand(0).getValueType() == MVT::i64) {
41350 SDValue In = N0.getOperand(0);
41351 APInt Mask = APInt::getHighBitsSet(64, 32);
41352 if (DAG.MaskedValueIsZero(In, Mask)) {
41353 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
41354 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
41355 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
41356 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
41357 return DAG.getBitcast(VT, Movl);
41358 }
41359 }
41360
41361 // Load a scalar integer constant directly to XMM instead of transferring an
41362 // immediate value from GPR.
41363 // vzext_movl (scalar_to_vector C) --> load [C,0...]
41364 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
41365 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
41366 // Create a vector constant - scalar constant followed by zeros.
41367 EVT ScalarVT = N0.getOperand(0).getValueType();
41368 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
41369 unsigned NumElts = VT.getVectorNumElements();
41370 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
41371 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
41372 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41373
41374 // Load the vector constant from constant pool.
41375 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
41376 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
41377 MachinePointerInfo MPI =
41378 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
41379 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41380 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41381 MachineMemOperand::MOLoad);
41382 }
41383 }
41384
41385 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41386 // insert into a zero vector. This helps get VZEXT_MOVL closer to
41387 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
41388 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41389 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41390 SDValue V = peekThroughOneUseBitcasts(N0);
41391
41392 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41393 isNullConstant(V.getOperand(2))) {
41394 SDValue In = V.getOperand(1);
41395 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
41396 In.getValueSizeInBits() /
41397 VT.getScalarSizeInBits());
41398 In = DAG.getBitcast(SubVT, In);
41399 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41400 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41401 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41402 V.getOperand(2));
41403 }
41404 }
41405
41406 return SDValue();
41407 }
41408 case X86ISD::BLENDI: {
41409 SDValue N0 = N.getOperand(0);
41410 SDValue N1 = N.getOperand(1);
41411
41412 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41413 // TODO: Handle MVT::v16i16 repeated blend mask.
41414 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41415 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41416 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41417 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41418 SrcVT.getScalarSizeInBits() >= 32) {
41419 unsigned BlendMask = N.getConstantOperandVal(2);
41420 unsigned Size = VT.getVectorNumElements();
41421 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
41422 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
41423 return DAG.getBitcast(
41424 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41425 N1.getOperand(0),
41426 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
41427 }
41428 }
41429 return SDValue();
41430 }
41431 case X86ISD::SHUFP: {
41432 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41433 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41434 // TODO: Support types other than v4f32.
41435 if (VT == MVT::v4f32) {
41436 bool Updated = false;
41437 SmallVector<int> Mask;
41438 SmallVector<SDValue> Ops;
41439 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
41440 Ops.size() == 2) {
41441 for (int i = 0; i != 2; ++i) {
41442 SmallVector<SDValue> SubOps;
41443 SmallVector<int> SubMask, SubScaledMask;
41444 SDValue Sub = peekThroughBitcasts(Ops[i]);
41445 // TODO: Scaling might be easier if we specify the demanded elts.
41446 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41447 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41448 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41449 int Ofs = i * 2;
41450 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41451 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41452 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41453 Updated = true;
41454 }
41455 }
41456 }
41457 if (Updated) {
41458 for (int &M : Mask)
41459 M %= 4;
41460 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41461 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
41462 }
41463 }
41464 return SDValue();
41465 }
41466 case X86ISD::VPERMI: {
41467 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41468 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
41469 SDValue N0 = N.getOperand(0);
41470 SDValue N1 = N.getOperand(1);
41471 unsigned EltSizeInBits = VT.getScalarSizeInBits();
41472 if (N0.getOpcode() == ISD::BITCAST &&
41473 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
41474 SDValue Src = N0.getOperand(0);
41475 EVT SrcVT = Src.getValueType();
41476 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
41477 return DAG.getBitcast(VT, Res);
41478 }
41479 return SDValue();
41480 }
41481 case X86ISD::VPERM2X128: {
41482 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41483 SDValue LHS = N->getOperand(0);
41484 SDValue RHS = N->getOperand(1);
41485 if (LHS.getOpcode() == ISD::BITCAST &&
41486 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
41487 EVT SrcVT = LHS.getOperand(0).getValueType();
41488 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
41489 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
41490 DAG.getBitcast(SrcVT, LHS),
41491 DAG.getBitcast(SrcVT, RHS),
41492 N->getOperand(2)));
41493 }
41494 }
41495
41496 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
41497 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
41498 return Res;
41499
41500 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
41501 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
41502 auto FindSubVector128 = [&](unsigned Idx) {
41503 if (Idx > 3)
41504 return SDValue();
41505 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
41506 SmallVector<SDValue> SubOps;
41507 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
41508 return SubOps[Idx & 1];
41509 unsigned NumElts = Src.getValueType().getVectorNumElements();
41510 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
41511 Src.getOperand(1).getValueSizeInBits() == 128 &&
41512 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
41513 return Src.getOperand(1);
41514 }
41515 return SDValue();
41516 };
41517 unsigned Imm = N.getConstantOperandVal(2);
41518 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
41519 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
41520 MVT SubVT = VT.getHalfNumVectorElementsVT();
41521 SubLo = DAG.getBitcast(SubVT, SubLo);
41522 SubHi = DAG.getBitcast(SubVT, SubHi);
41523 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
41524 }
41525 }
41526 return SDValue();
41527 }
41528 case X86ISD::PSHUFD:
41529 case X86ISD::PSHUFLW:
41530 case X86ISD::PSHUFHW:
41531 Mask = getPSHUFShuffleMask(N);
41532 assert(Mask.size() == 4);
41533 break;
41534 case X86ISD::MOVSD:
41535 case X86ISD::MOVSH:
41536 case X86ISD::MOVSS: {
41537 SDValue N0 = N.getOperand(0);
41538 SDValue N1 = N.getOperand(1);
41539
41540 // Canonicalize scalar FPOps:
41541 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
41542 // If commutable, allow OP(N1[0], N0[0]).
41543 unsigned Opcode1 = N1.getOpcode();
41544 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
41545 Opcode1 == ISD::FDIV) {
41546 SDValue N10 = N1.getOperand(0);
41547 SDValue N11 = N1.getOperand(1);
41548 if (N10 == N0 ||
41549 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
41550 if (N10 != N0)
41551 std::swap(N10, N11);
41552 MVT SVT = VT.getVectorElementType();
41553 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
41554 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
41555 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
41556 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
41557 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
41558 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
41559 }
41560 }
41561
41562 return SDValue();
41563 }
41564 case X86ISD::INSERTPS: {
41565 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
41566 SDValue Op0 = N.getOperand(0);
41567 SDValue Op1 = N.getOperand(1);
41568 unsigned InsertPSMask = N.getConstantOperandVal(2);
41569 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
41570 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
41571 unsigned ZeroMask = InsertPSMask & 0xF;
41572
41573 // If we zero out all elements from Op0 then we don't need to reference it.
41574 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
41575 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
41576 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41577
41578 // If we zero out the element from Op1 then we don't need to reference it.
41579 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
41580 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41581 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41582
41583 // Attempt to merge insertps Op1 with an inner target shuffle node.
41584 SmallVector<int, 8> TargetMask1;
41585 SmallVector<SDValue, 2> Ops1;
41586 APInt KnownUndef1, KnownZero1;
41587 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
41588 KnownZero1)) {
41589 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
41590 // Zero/UNDEF insertion - zero out element and remove dependency.
41591 InsertPSMask |= (1u << DstIdx);
41592 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41593 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41594 }
41595 // Update insertps mask srcidx and reference the source input directly.
41596 int M = TargetMask1[SrcIdx];
41597 assert(0 <= M && M < 8 && "Shuffle index out of range")(static_cast <bool> (0 <= M && M < 8 &&
"Shuffle index out of range") ? void (0) : __assert_fail ("0 <= M && M < 8 && \"Shuffle index out of range\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41597, __extension__
__PRETTY_FUNCTION__))
;
41598 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
41599 Op1 = Ops1[M < 4 ? 0 : 1];
41600 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41601 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41602 }
41603
41604 // Attempt to merge insertps Op0 with an inner target shuffle node.
41605 SmallVector<int, 8> TargetMask0;
41606 SmallVector<SDValue, 2> Ops0;
41607 APInt KnownUndef0, KnownZero0;
41608 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
41609 KnownZero0)) {
41610 bool Updated = false;
41611 bool UseInput00 = false;
41612 bool UseInput01 = false;
41613 for (int i = 0; i != 4; ++i) {
41614 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
41615 // No change if element is already zero or the inserted element.
41616 continue;
41617 }
41618
41619 if (KnownUndef0[i] || KnownZero0[i]) {
41620 // If the target mask is undef/zero then we must zero the element.
41621 InsertPSMask |= (1u << i);
41622 Updated = true;
41623 continue;
41624 }
41625
41626 // The input vector element must stay in place (index i from either input).
41627 int M = TargetMask0[i];
41628 if (M != i && M != (i + 4))
41629 return SDValue();
41630
41631 // Determine which inputs of the target shuffle we're using.
41632 UseInput00 |= (0 <= M && M < 4);
41633 UseInput01 |= (4 <= M);
41634 }
41635
41636 // If we're not using both inputs of the target shuffle then use the
41637 // referenced input directly.
41638 if (UseInput00 && !UseInput01) {
41639 Updated = true;
41640 Op0 = Ops0[0];
41641 } else if (!UseInput00 && UseInput01) {
41642 Updated = true;
41643 Op0 = Ops0[1];
41644 }
41645
41646 if (Updated)
41647 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41648 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41649 }
41650
41651 // If we're inserting an element from a vbroadcast load, fold the
41652 // load into the X86insertps instruction. We need to convert the scalar
41653 // load to a vector and clear the source lane of the INSERTPS control.
41654 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
41655 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
41656 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
41657 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
41658 MemIntr->getBasePtr(),
41659 MemIntr->getMemOperand());
41660 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
41661 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
41662 Load),
41663 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
41664 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41665 return Insert;
41666 }
41667 }
41668
41669 return SDValue();
41670 }
41671 default:
41672 return SDValue();
41673 }
41674
41675 // Nuke no-op shuffles that show up after combining.
41676 if (isNoopShuffleMask(Mask))
41677 return N.getOperand(0);
41678
41679 // Look for simplifications involving one or two shuffle instructions.
41680 SDValue V = N.getOperand(0);
41681 switch (N.getOpcode()) {
41682 default:
41683 break;
41684 case X86ISD::PSHUFLW:
41685 case X86ISD::PSHUFHW:
41686 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
41687
41688 // See if this reduces to a PSHUFD which is no more expensive and can
41689 // combine with more operations. Note that it has to at least flip the
41690 // dwords as otherwise it would have been removed as a no-op.
41691 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
41692 int DMask[] = {0, 1, 2, 3};
41693 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
41694 DMask[DOffset + 0] = DOffset + 1;
41695 DMask[DOffset + 1] = DOffset + 0;
41696 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
41697 V = DAG.getBitcast(DVT, V);
41698 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
41699 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
41700 return DAG.getBitcast(VT, V);
41701 }
41702
41703 // Look for shuffle patterns which can be implemented as a single unpack.
41704 // FIXME: This doesn't handle the location of the PSHUFD generically, and
41705 // only works when we have a PSHUFD followed by two half-shuffles.
41706 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
41707 (V.getOpcode() == X86ISD::PSHUFLW ||
41708 V.getOpcode() == X86ISD::PSHUFHW) &&
41709 V.getOpcode() != N.getOpcode() &&
41710 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
41711 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
41712 if (D.getOpcode() == X86ISD::PSHUFD) {
41713 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41714 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
41715 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41716 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41717 int WordMask[8];
41718 for (int i = 0; i < 4; ++i) {
41719 WordMask[i + NOffset] = Mask[i] + NOffset;
41720 WordMask[i + VOffset] = VMask[i] + VOffset;
41721 }
41722 // Map the word mask through the DWord mask.
41723 int MappedMask[8];
41724 for (int i = 0; i < 8; ++i)
41725 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
41726 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
41727 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
41728 // We can replace all three shuffles with an unpack.
41729 V = DAG.getBitcast(VT, D.getOperand(0));
41730 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
41731 : X86ISD::UNPCKH,
41732 DL, VT, V, V);
41733 }
41734 }
41735 }
41736
41737 break;
41738
41739 case X86ISD::PSHUFD:
41740 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
41741 return NewN;
41742
41743 break;
41744 }
41745
41746 return SDValue();
41747}
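
The mapping used above, MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2, comes from viewing a PSHUFD at word granularity: output word j of PSHUFD(X, DMask) is input word 2 * DMask[j / 2] + j % 2. A minimal standalone check (not LLVM code; pshufdAsWords is an illustrative model):

// Standalone check of the word-through-dword mapping used above:
// if D = pshufd(X, DMask) on a v8i16 viewed as v4i32, then output word j of D
// is input word 2 * DMask[j / 2] + j % 2.
#include <array>
#include <cassert>

using V8 = std::array<int, 8>;

static V8 pshufdAsWords(const V8 &X, const std::array<int, 4> &DMask) {
  V8 R{};
  for (int i = 0; i != 4; ++i) {        // move dword DMask[i] into dword i
    R[2 * i + 0] = X[2 * DMask[i] + 0];
    R[2 * i + 1] = X[2 * DMask[i] + 1];
  }
  return R;
}

int main() {
  V8 X = {0, 1, 2, 3, 4, 5, 6, 7};      // word k holds the value k
  std::array<int, 4> DMask = {2, 0, 3, 1};
  V8 D = pshufdAsWords(X, DMask);
  for (int j = 0; j != 8; ++j)
    assert(D[j] == 2 * DMask[j / 2] + j % 2);
  return 0;
}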
41748
41749/// Checks if the shuffle mask takes subsequent elements
41750/// alternately from two vectors.
41751/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
41752static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
41753
41754 int ParitySrc[2] = {-1, -1};
41755 unsigned Size = Mask.size();
41756 for (unsigned i = 0; i != Size; ++i) {
41757 int M = Mask[i];
41758 if (M < 0)
41759 continue;
41760
41761 // Make sure we are using the matching element from the input.
41762 if ((M % Size) != i)
41763 return false;
41764
41765 // Make sure we use the same input for all elements of the same parity.
41766 int Src = M / Size;
41767 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41768 return false;
41769 ParitySrc[i % 2] = Src;
41770 }
41771
41772 // Make sure each input is used.
41773 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41774 return false;
41775
41776 Op0Even = ParitySrc[0] == 0;
41777 return true;
41778}
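
For reference, a standalone copy of the parity check that compiles outside LLVM (std::vector replaces ArrayRef, assuming nothing beyond standard C++), together with a few masks showing which patterns qualify:

// Standalone copy of the parity check above, handy for testing which masks
// qualify as an ADDSUB/SUBADD blend (negative mask entries mean "undef").
#include <cassert>
#include <vector>

static bool isAddSubOrSubAddMask(const std::vector<int> &Mask, bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  int Size = (int)Mask.size();
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if ((M % Size) != i)                 // must use the matching element
      return false;
    int Src = M / Size;                  // which of the two inputs
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even;
  assert(isAddSubOrSubAddMask({0, 5, 2, 7}, Op0Even) && Op0Even);   // op0 on even lanes
  assert(isAddSubOrSubAddMask({4, 1, 6, 3}, Op0Even) && !Op0Even);  // op1 on even lanes
  assert(!isAddSubOrSubAddMask({0, 1, 2, 3}, Op0Even));             // single source
  return 0;
}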
41779
41780/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
41781/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
41782/// are written to the parameters \p Opnd0 and \p Opnd1.
41783///
41784 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
41785 /// nodes so it is easier to generically match. We also insert dummy vector
41786 /// shuffle nodes for the operands which explicitly discard the lanes which are
41787 /// unused by this operation, so that the rest of the combiner can see
41788 /// the fact that they're unused.
41789static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41790 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41791 bool &IsSubAdd) {
41792
41793 EVT VT = N->getValueType(0);
41794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41795 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41796 !VT.getSimpleVT().isFloatingPoint())
41797 return false;
41798
41799 // We only handle target-independent shuffles.
41800 // FIXME: It would be easy and harmless to use the target shuffle mask
41801 // extraction tool to support more.
41802 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41803 return false;
41804
41805 SDValue V1 = N->getOperand(0);
41806 SDValue V2 = N->getOperand(1);
41807
41808 // Make sure we have an FADD and an FSUB.
41809 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41810 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41811 V1.getOpcode() == V2.getOpcode())
41812 return false;
41813
41814 // If there are other uses of these operations we can't fold them.
41815 if (!V1->hasOneUse() || !V2->hasOneUse())
41816 return false;
41817
41818 // Ensure that both operations have the same operands. Note that we can
41819 // commute the FADD operands.
41820 SDValue LHS, RHS;
41821 if (V1.getOpcode() == ISD::FSUB) {
41822 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41823 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41824 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41825 return false;
41826 } else {
41827 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41828 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41829 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41830 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41831 return false;
41832 }
41833
41834 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41835 bool Op0Even;
41836 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41837 return false;
41838
41839 // It's a subadd if the vector in the even parity is an FADD.
41840 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41841 : V2->getOpcode() == ISD::FADD;
41842
41843 Opnd0 = LHS;
41844 Opnd1 = RHS;
41845 return true;
41846}
41847
41848/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41849static SDValue combineShuffleToFMAddSub(SDNode *N,
41850 const X86Subtarget &Subtarget,
41851 SelectionDAG &DAG) {
41852 // We only handle target-independent shuffles.
41853 // FIXME: It would be easy and harmless to use the target shuffle mask
41854 // extraction tool to support more.
41855 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41856 return SDValue();
41857
41858 MVT VT = N->getSimpleValueType(0);
41859 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41860 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41861 return SDValue();
41862
41863 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
41864 SDValue Op0 = N->getOperand(0);
41865 SDValue Op1 = N->getOperand(1);
41866 SDValue FMAdd = Op0, FMSub = Op1;
41867 if (FMSub.getOpcode() != X86ISD::FMSUB)
41868 std::swap(FMAdd, FMSub);
41869
41870 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41871 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41872 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41873 FMAdd.getOperand(2) != FMSub.getOperand(2))
41874 return SDValue();
41875
41876 // Check for correct shuffle mask.
41877 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41878 bool Op0Even;
41879 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41880 return SDValue();
41881
41882 // FMAddSub takes zeroth operand from FMSub node.
41883 SDLoc DL(N);
41884 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41885 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41886 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41887 FMAdd.getOperand(2));
41888}
41889
41890/// Try to combine a shuffle into a target-specific add-sub or
41891/// mul-add-sub node.
41892static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
41893 const X86Subtarget &Subtarget,
41894 SelectionDAG &DAG) {
41895 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
41896 return V;
41897
41898 SDValue Opnd0, Opnd1;
41899 bool IsSubAdd;
41900 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41901 return SDValue();
41902
41903 MVT VT = N->getSimpleValueType(0);
41904 SDLoc DL(N);
41905
41906 // Try to generate X86ISD::FMADDSUB node here.
41907 SDValue Opnd2;
41908 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41909 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41910 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41911 }
41912
41913 if (IsSubAdd)
41914 return SDValue();
41915
41916 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41917 // the ADDSUB idiom has been successfully recognized. There are no known
41918 // X86 targets with 512-bit ADDSUB instructions!
41919 if (VT.is512BitVector())
41920 return SDValue();
41921
41922 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41923 // the ADDSUB idiom has been successfully recognized. There are no known
41924 // X86 targets with FP16 ADDSUB instructions!
41925 if (VT.getVectorElementType() == MVT::f16)
41926 return SDValue();
41927
41928 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41929}
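
In scalar terms, the pattern recognized above blends the FSUB results into the even lanes and the FADD results into the odd lanes, which is exactly ADDSUBPS (subtract in even lanes, add in odd lanes); with FADD in the even lanes instead it is the SUBADD form. A minimal standalone sketch (not LLVM code; blendEvenOdd and addsub model the mask <0,5,2,7> and the instruction):

// Standalone illustration of the ADDSUB pattern recognized above:
// shuffle(fsub(A,B), fadd(A,B), <0,5,2,7>) picks the FSUB results for even
// lanes and the FADD results for odd lanes, i.e. ADDSUBPS(A, B).
#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

static V4 blendEvenOdd(const V4 &Even, const V4 &Odd) { // mask <0,5,2,7>
  return {Even[0], Odd[1], Even[2], Odd[3]};
}

static V4 addsub(const V4 &A, const V4 &B) {            // ADDSUBPS semantics
  return {A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
}

int main() {
  V4 A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  V4 Sub = {A[0] - B[0], A[1] - B[1], A[2] - B[2], A[3] - B[3]};
  V4 Add = {A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]};
  assert(blendEvenOdd(Sub, Add) == addsub(A, B));
  return 0;
}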
41930
41931// We are looking for a shuffle where both sources are concatenated with undef
41932// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41933// if we can express this as a single-source shuffle, that's preferable.
41934static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
41935 const X86Subtarget &Subtarget) {
41936 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41937 return SDValue();
41938
41939 EVT VT = N->getValueType(0);
41940
41941 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41942 if (!VT.is128BitVector() && !VT.is256BitVector())
41943 return SDValue();
41944
41945 if (VT.getVectorElementType() != MVT::i32 &&
41946 VT.getVectorElementType() != MVT::i64 &&
41947 VT.getVectorElementType() != MVT::f32 &&
41948 VT.getVectorElementType() != MVT::f64)
41949 return SDValue();
41950
41951 SDValue N0 = N->getOperand(0);
41952 SDValue N1 = N->getOperand(1);
41953
41954 // Check that both sources are concats with undef.
41955 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41956 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41957 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41958 !N1.getOperand(1).isUndef())
41959 return SDValue();
41960
41961 // Construct the new shuffle mask. Elements from the first source retain their
41962 // index, but elements from the second source no longer need to skip an undef.
41963 SmallVector<int, 8> Mask;
41964 int NumElts = VT.getVectorNumElements();
41965
41966 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
41967 for (int Elt : SVOp->getMask())
41968 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41969
41970 SDLoc DL(N);
41971 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41972 N1.getOperand(0));
41973 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41974}
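
The rewrite Elt < NumElts ? Elt : Elt - NumElts / 2 re-indexes the second source: its real elements sit at positions NumElts .. NumElts + NumElts/2 - 1 of the two-operand shuffle, but start at NumElts/2 once t1 and t2 share a single concat. A minimal standalone check for a v4 case (not LLVM code; the names are illustrative and -1 stands in for undef):

// Standalone check of the mask re-indexing above for a v4 shuffle of
// concat(t1, undef) and concat(t2, undef), with t1/t2 of width 2.
#include <array>
#include <cassert>

int main() {
  constexpr int NumElts = 4;
  std::array<int, 2> T1 = {10, 11}, T2 = {20, 21};
  // Two-operand view: elements 0..3 are {t1[0], t1[1], undef, undef},
  //                   elements 4..7 are {t2[0], t2[1], undef, undef}.
  auto oldElt = [&](int Idx) {
    return Idx < NumElts ? (Idx < 2 ? T1[Idx] : -1)
                         : (Idx - NumElts < 2 ? T2[Idx - NumElts] : -1);
  };
  // Single-source view: concat(t1, t2) = {t1[0], t1[1], t2[0], t2[1]}.
  std::array<int, 4> Concat = {T1[0], T1[1], T2[0], T2[1]};

  std::array<int, 4> Mask = {0, 4, 1, 5};   // only defined lanes are used
  for (int Elt : Mask) {
    int NewElt = Elt < NumElts ? Elt : Elt - NumElts / 2;
    assert(oldElt(Elt) == Concat[NewElt]);
  }
  return 0;
}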
41975
41976/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41977/// low half of each source vector and does not set any high half elements in
41978/// the destination vector, narrow the shuffle to half its original size.
41979static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41980 if (!Shuf->getValueType(0).isSimple())
41981 return SDValue();
41982 MVT VT = Shuf->getSimpleValueType(0);
41983 if (!VT.is256BitVector() && !VT.is512BitVector())
41984 return SDValue();
41985
41986 // See if we can ignore all of the high elements of the shuffle.
41987 ArrayRef<int> Mask = Shuf->getMask();
41988 if (!isUndefUpperHalf(Mask))
41989 return SDValue();
41990
41991 // Check if the shuffle mask accesses only the low half of each input vector
41992 // (half-index output is 0 or 2).
41993 int HalfIdx1, HalfIdx2;
41994 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41995 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41996 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41997 return SDValue();
41998
41999 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42000 // The trick is knowing that all of the insert/extract are actually free
42001 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42002 // of narrow inputs into a narrow output, and that is always cheaper than
42003 // the wide shuffle that we started with.
42004 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42005 Shuf->getOperand(1), HalfMask, HalfIdx1,
42006 HalfIdx2, false, DAG, /*UseConcat*/true);
42007}
42008
42009static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42010 TargetLowering::DAGCombinerInfo &DCI,
42011 const X86Subtarget &Subtarget) {
42012 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42013 if (SDValue V = narrowShuffle(Shuf, DAG))
42014 return V;
42015
42016 // If we have legalized the vector types, look for blends of FADD and FSUB
42017 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42018 SDLoc dl(N);
42019 EVT VT = N->getValueType(0);
42020 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42021 if (TLI.isTypeLegal(VT))
42022 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42023 return AddSub;
42024
42025 // Attempt to combine into a vector load/broadcast.
42026 if (SDValue LD = combineToConsecutiveLoads(
42027 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42028 return LD;
42029
42030 // For AVX2, we sometimes want to combine
42031 // (vector_shuffle <mask> (concat_vectors t1, undef)
42032 // (concat_vectors t2, undef))
42033 // Into:
42034 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42035 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42036 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42037 return ShufConcat;
42038
42039 if (isTargetShuffle(N->getOpcode())) {
42040 SDValue Op(N, 0);
42041 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42042 return Shuffle;
42043
42044 // Try recursively combining arbitrary sequences of x86 shuffle
42045 // instructions into higher-order shuffles. We do this after combining
42046 // specific PSHUF instruction sequences into their minimal form so that we
42047 // can evaluate how many specialized shuffle instructions are involved in
42048 // a particular chain.
42049 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42050 return Res;
42051
42052 // Simplify source operands based on shuffle mask.
42053 // TODO - merge this into combineX86ShufflesRecursively.
42054 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42055 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42056 return SDValue(N, 0);
42057
42058 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42059 // Perform this after other shuffle combines to allow inner shuffles to be
42060 // combined away first.
42061 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42062 return BinOp;
42063 }
42064
42065 return SDValue();
42066}
42067
42068// Simplify variable target shuffle masks based on the demanded elements.
42069// TODO: Handle DemandedBits in mask indices as well?
42070bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42071 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42072 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42073 // If we're demanding all elements don't bother trying to simplify the mask.
42074 unsigned NumElts = DemandedElts.getBitWidth();
42075 if (DemandedElts.isAllOnes())
42076 return false;
42077
42078 SDValue Mask = Op.getOperand(MaskIndex);
42079 if (!Mask.hasOneUse())
42080 return false;
42081
42082 // Attempt to generically simplify the variable shuffle mask.
42083 APInt MaskUndef, MaskZero;
42084 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42085 Depth + 1))
42086 return true;
42087
42088 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42089 // TODO: Support other types from getTargetShuffleMaskIndices?
42090 SDValue BC = peekThroughOneUseBitcasts(Mask);
42091 EVT BCVT = BC.getValueType();
42092 auto *Load = dyn_cast<LoadSDNode>(BC);
42093 if (!Load)
42094 return false;
42095
42096 const Constant *C = getTargetConstantFromNode(Load);
42097 if (!C)
42098 return false;
42099
42100 Type *CTy = C->getType();
42101 if (!CTy->isVectorTy() ||
42102 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42103 return false;
42104
42105 // Handle scaling for i64 elements on 32-bit targets.
42106 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42107 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42108 return false;
42109 unsigned Scale = NumCstElts / NumElts;
42110
42111 // Simplify mask if we have an undemanded element that is not undef.
42112 bool Simplified = false;
42113 SmallVector<Constant *, 32> ConstVecOps;
42114 for (unsigned i = 0; i != NumCstElts; ++i) {
42115 Constant *Elt = C->getAggregateElement(i);
42116 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42117 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42118 Simplified = true;
42119 continue;
42120 }
42121 ConstVecOps.push_back(Elt);
42122 }
42123 if (!Simplified)
42124 return false;
42125
42126 // Generate new constant pool entry + legalize immediately for the load.
42127 SDLoc DL(Op);
42128 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42129 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42130 SDValue NewMask = TLO.DAG.getLoad(
42131 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42132 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42133 Load->getAlign());
42134 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42135}
42136
42137bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42138 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42139 TargetLoweringOpt &TLO, unsigned Depth) const {
42140 int NumElts = DemandedElts.getBitWidth();
42141 unsigned Opc = Op.getOpcode();
42142 EVT VT = Op.getValueType();
42143
42144 // Handle special case opcodes.
42145 switch (Opc) {
42146 case X86ISD::PMULDQ:
42147 case X86ISD::PMULUDQ: {
42148 APInt LHSUndef, LHSZero;
42149 APInt RHSUndef, RHSZero;
42150 SDValue LHS = Op.getOperand(0);
42151 SDValue RHS = Op.getOperand(1);
42152 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42153 Depth + 1))
42154 return true;
42155 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42156 Depth + 1))
42157 return true;
42158 // Multiply by zero.
42159 KnownZero = LHSZero | RHSZero;
42160 break;
42161 }
42162 case X86ISD::VPMADDWD: {
42163 APInt LHSUndef, LHSZero;
42164 APInt RHSUndef, RHSZero;
42165 SDValue LHS = Op.getOperand(0);
42166 SDValue RHS = Op.getOperand(1);
42167 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42168
42169 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42170 Depth + 1))
42171 return true;
42172 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42173 Depth + 1))
42174 return true;
42175
42176 // TODO: Multiply by zero.
42177
42178 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42179 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42180 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42181 Depth + 1))
42182 return true;
42183 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42184 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42185 Depth + 1))
42186 return true;
42187 break;
42188 }
42189 case X86ISD::PSADBW: {
42190 SDValue LHS = Op.getOperand(0);
42191 SDValue RHS = Op.getOperand(1);
42192 assert(VT.getScalarType() == MVT::i64 &&
42193 LHS.getValueType() == RHS.getValueType() &&
42194 LHS.getValueType().getScalarType() == MVT::i8 &&
42195 "Unexpected PSADBW types");
42196
42197 // Aggressively peek through ops to get at the demanded elts.
42198 if (!DemandedElts.isAllOnes()) {
42199 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42200 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42201 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42202 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42203 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42204 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42205 if (NewLHS || NewRHS) {
42206 NewLHS = NewLHS ? NewLHS : LHS;
42207 NewRHS = NewRHS ? NewRHS : RHS;
42208 return TLO.CombineTo(
42209 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42210 }
42211 }
42212 break;
42213 }
42214 case X86ISD::VSHL:
42215 case X86ISD::VSRL:
42216 case X86ISD::VSRA: {
42217 // We only need the bottom 64-bits of the (128-bit) shift amount.
42218 SDValue Amt = Op.getOperand(1);
42219 MVT AmtVT = Amt.getSimpleValueType();
42220 assert(AmtVT.is128BitVector() && "Unexpected value type");
42221
42222 // If the shift amount is only ever reused as the amount operand of other SSE
42223 // shifts, then we know that only its bottom 64 bits are used.
42224 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
42225 unsigned UseOpc = Use->getOpcode();
42226 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
42227 UseOpc == X86ISD::VSRA) &&
42228 Use->getOperand(0) != Amt;
42229 });
42230
42231 APInt AmtUndef, AmtZero;
42232 unsigned NumAmtElts = AmtVT.getVectorNumElements();
42233 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
42234 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
42235 Depth + 1, AssumeSingleUse))
42236 return true;
42237 [[fallthrough]];
42238 }
42239 case X86ISD::VSHLI:
42240 case X86ISD::VSRLI:
42241 case X86ISD::VSRAI: {
42242 SDValue Src = Op.getOperand(0);
42243 APInt SrcUndef;
42244 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
42245 Depth + 1))
42246 return true;
42247
42248 // Fold shift(0,x) -> 0
42249 if (DemandedElts.isSubsetOf(KnownZero))
42250 return TLO.CombineTo(
42251 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42252
42253 // Aggressively peek through ops to get at the demanded elts.
42254 if (!DemandedElts.isAllOnes())
42255 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42256 Src, DemandedElts, TLO.DAG, Depth + 1))
42257 return TLO.CombineTo(
42258 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
42259 break;
42260 }
42261 case X86ISD::VPSHA:
42262 case X86ISD::VPSHL:
42263 case X86ISD::VSHLV:
42264 case X86ISD::VSRLV:
42265 case X86ISD::VSRAV: {
42266 APInt LHSUndef, LHSZero;
42267 APInt RHSUndef, RHSZero;
42268 SDValue LHS = Op.getOperand(0);
42269 SDValue RHS = Op.getOperand(1);
42270 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42271 Depth + 1))
42272 return true;
42273
42274 // Fold shift(0,x) -> 0
42275 if (DemandedElts.isSubsetOf(LHSZero))
42276 return TLO.CombineTo(
42277 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42278
42279 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42280 Depth + 1))
42281 return true;
42282
42283 KnownZero = LHSZero;
42284 break;
42285 }
42286 case X86ISD::KSHIFTL: {
42287 SDValue Src = Op.getOperand(0);
42288 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42289 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42290 unsigned ShiftAmt = Amt->getZExtValue();
42291
42292 if (ShiftAmt == 0)
42293 return TLO.CombineTo(Op, Src);
42294
42295 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42296 // single shift. We can do this if the bottom bits (which are shifted
42297 // out) are never demanded.
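// Illustrative example (hypothetical v8i1 mask and amounts):
//   kshiftl (kshiftr X, 2), 5  with the low 5 result elements undemanded
//   --> kshiftl X, 3   (Diff = 5 - 2 = 3)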
42298 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42299 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42300 unsigned C1 = Src.getConstantOperandVal(1);
42301 unsigned NewOpc = X86ISD::KSHIFTL;
42302 int Diff = ShiftAmt - C1;
42303 if (Diff < 0) {
42304 Diff = -Diff;
42305 NewOpc = X86ISD::KSHIFTR;
42306 }
42307
42308 SDLoc dl(Op);
42309 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42310 return TLO.CombineTo(
42311 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42312 }
42313 }
42314
42315 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42316 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42317 Depth + 1))
42318 return true;
42319
42320 KnownUndef <<= ShiftAmt;
42321 KnownZero <<= ShiftAmt;
42322 KnownZero.setLowBits(ShiftAmt);
42323 break;
42324 }
42325 case X86ISD::KSHIFTR: {
42326 SDValue Src = Op.getOperand(0);
42327 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42328 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42329 unsigned ShiftAmt = Amt->getZExtValue();
42330
42331 if (ShiftAmt == 0)
42332 return TLO.CombineTo(Op, Src);
42333
42334 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42335 // single shift. We can do this if the top bits (which are shifted
42336 // out) are never demanded.
42337 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42338 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42339 unsigned C1 = Src.getConstantOperandVal(1);
42340 unsigned NewOpc = X86ISD::KSHIFTR;
42341 int Diff = ShiftAmt - C1;
42342 if (Diff < 0) {
42343 Diff = -Diff;
42344 NewOpc = X86ISD::KSHIFTL;
42345 }
42346
42347 SDLoc dl(Op);
42348 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42349 return TLO.CombineTo(
42350 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42351 }
42352 }
42353
42354 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42355 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42356 Depth + 1))
42357 return true;
42358
42359 KnownUndef.lshrInPlace(ShiftAmt);
42360 KnownZero.lshrInPlace(ShiftAmt);
42361 KnownZero.setHighBits(ShiftAmt);
42362 break;
42363 }
42364 case X86ISD::ANDNP: {
42365 // ANDNP = (~LHS & RHS);
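// Illustrative example (hypothetical constants): if an RHS lane is the constant 0
// the result lane is 0 regardless of LHS, so that LHS lane is not demanded; likewise
// an all-ones LHS constant lane forces a zero result, so that RHS lane isn't demanded.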
42366 SDValue LHS = Op.getOperand(0);
42367 SDValue RHS = Op.getOperand(1);
42368
42369 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42370 APInt UndefElts;
42371 SmallVector<APInt> EltBits;
42372 int NumElts = VT.getVectorNumElements();
42373 int EltSizeInBits = VT.getScalarSizeInBits();
42374 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42375 APInt OpElts = DemandedElts;
42376 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42377 EltBits)) {
42378 OpBits.clearAllBits();
42379 OpElts.clearAllBits();
42380 for (int I = 0; I != NumElts; ++I) {
42381 if (!DemandedElts[I])
42382 continue;
42383 if (UndefElts[I]) {
42384 // We can't assume an undef src element gives an undef dst - the
42385 // other src might be zero.
42386 OpBits.setAllBits();
42387 OpElts.setBit(I);
42388 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42389 (!Invert && !EltBits[I].isZero())) {
42390 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42391 OpElts.setBit(I);
42392 }
42393 }
42394 }
42395 return std::make_pair(OpBits, OpElts);
42396 };
42397 APInt BitsLHS, EltsLHS;
42398 APInt BitsRHS, EltsRHS;
42399 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42400 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42401
42402 APInt LHSUndef, LHSZero;
42403 APInt RHSUndef, RHSZero;
42404 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42405 Depth + 1))
42406 return true;
42407 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42408 Depth + 1))
42409 return true;
42410
42411 if (!DemandedElts.isAllOnes()) {
42412 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42413 TLO.DAG, Depth + 1);
42414 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42415 TLO.DAG, Depth + 1);
42416 if (NewLHS || NewRHS) {
42417 NewLHS = NewLHS ? NewLHS : LHS;
42418 NewRHS = NewRHS ? NewRHS : RHS;
42419 return TLO.CombineTo(
42420 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42421 }
42422 }
42423 break;
42424 }
42425 case X86ISD::CVTSI2P:
42426 case X86ISD::CVTUI2P: {
42427 SDValue Src = Op.getOperand(0);
42428 MVT SrcVT = Src.getSimpleValueType();
42429 APInt SrcUndef, SrcZero;
42430 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42431 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42432 Depth + 1))
42433 return true;
42434 break;
42435 }
42436 case X86ISD::PACKSS:
42437 case X86ISD::PACKUS: {
42438 SDValue N0 = Op.getOperand(0);
42439 SDValue N1 = Op.getOperand(1);
42440
42441 APInt DemandedLHS, DemandedRHS;
42442 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42443
42444 APInt LHSUndef, LHSZero;
42445 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42446 Depth + 1))
42447 return true;
42448 APInt RHSUndef, RHSZero;
42449 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42450 Depth + 1))
42451 return true;
42452
42453 // TODO - pass on known zero/undef.
42454
42455 // Aggressively peek through ops to get at the demanded elts.
42456 // TODO - we should do this for all target/faux shuffles ops.
42457 if (!DemandedElts.isAllOnes()) {
42458 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42459 TLO.DAG, Depth + 1);
42460 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42461 TLO.DAG, Depth + 1);
42462 if (NewN0 || NewN1) {
42463 NewN0 = NewN0 ? NewN0 : N0;
42464 NewN1 = NewN1 ? NewN1 : N1;
42465 return TLO.CombineTo(Op,
42466 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42467 }
42468 }
42469 break;
42470 }
42471 case X86ISD::HADD:
42472 case X86ISD::HSUB:
42473 case X86ISD::FHADD:
42474 case X86ISD::FHSUB: {
42475 SDValue N0 = Op.getOperand(0);
42476 SDValue N1 = Op.getOperand(1);
42477
42478 APInt DemandedLHS, DemandedRHS;
42479 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42480
42481 APInt LHSUndef, LHSZero;
42482 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42483 Depth + 1))
42484 return true;
42485 APInt RHSUndef, RHSZero;
42486 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42487 Depth + 1))
42488 return true;
42489
42490 // TODO - pass on known zero/undef.
42491
42492 // Aggressively peek through ops to get at the demanded elts.
42493 // TODO: Handle repeated operands.
42494 if (N0 != N1 && !DemandedElts.isAllOnes()) {
42495 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42496 TLO.DAG, Depth + 1);
42497 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42498 TLO.DAG, Depth + 1);
42499 if (NewN0 || NewN1) {
42500 NewN0 = NewN0 ? NewN0 : N0;
42501 NewN1 = NewN1 ? NewN1 : N1;
42502 return TLO.CombineTo(Op,
42503 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42504 }
42505 }
42506 break;
42507 }
42508 case X86ISD::VTRUNC:
42509 case X86ISD::VTRUNCS:
42510 case X86ISD::VTRUNCUS: {
42511 SDValue Src = Op.getOperand(0);
42512 MVT SrcVT = Src.getSimpleValueType();
42513 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42514 APInt SrcUndef, SrcZero;
42515 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
42516 Depth + 1))
42517 return true;
42518 KnownZero = SrcZero.zextOrTrunc(NumElts);
42519 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
42520 break;
42521 }
42522 case X86ISD::BLENDV: {
42523 APInt SelUndef, SelZero;
42524 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
42525 SelZero, TLO, Depth + 1))
42526 return true;
42527
42528 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
42529 APInt LHSUndef, LHSZero;
42530 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
42531 LHSZero, TLO, Depth + 1))
42532 return true;
42533
42534 APInt RHSUndef, RHSZero;
42535 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
42536 RHSZero, TLO, Depth + 1))
42537 return true;
42538
42539 KnownZero = LHSZero & RHSZero;
42540 KnownUndef = LHSUndef & RHSUndef;
42541 break;
42542 }
42543 case X86ISD::VZEXT_MOVL: {
42544 // If upper demanded elements are already zero then we have nothing to do.
42545 SDValue Src = Op.getOperand(0);
42546 APInt DemandedUpperElts = DemandedElts;
42547 DemandedUpperElts.clearLowBits(1);
42548 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
42549 return TLO.CombineTo(Op, Src);
42550 break;
42551 }
42552 case X86ISD::VBROADCAST: {
42553 SDValue Src = Op.getOperand(0);
42554 MVT SrcVT = Src.getSimpleValueType();
42555 if (!SrcVT.isVector())
42556 break;
42557 // Don't bother broadcasting if we just need the 0'th element.
42558 if (DemandedElts == 1) {
42559 if (Src.getValueType() != VT)
42560 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
42561 SDLoc(Op));
42562 return TLO.CombineTo(Op, Src);
42563 }
42564 APInt SrcUndef, SrcZero;
42565 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
42566 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42567 Depth + 1))
42568 return true;
42569 // Aggressively peek through src to get at the demanded elt.
42570 // TODO - we should do this for all target/faux shuffles ops.
42571 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42572 Src, SrcElts, TLO.DAG, Depth + 1))
42573 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42574 break;
42575 }
42576 case X86ISD::VPERMV:
42577 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
42578 Depth))
42579 return true;
42580 break;
42581 case X86ISD::PSHUFB:
42582 case X86ISD::VPERMV3:
42583 case X86ISD::VPERMILPV:
42584 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
42585 Depth))
42586 return true;
42587 break;
42588 case X86ISD::VPPERM:
42589 case X86ISD::VPERMIL2:
42590 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
42591 Depth))
42592 return true;
42593 break;
42594 }
42595
42596 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
42597 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
42598 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
42599 if ((VT.is256BitVector() || VT.is512BitVector()) &&
42600 DemandedElts.lshr(NumElts / 2) == 0) {
42601 unsigned SizeInBits = VT.getSizeInBits();
42602 unsigned ExtSizeInBits = SizeInBits / 2;
42603
42604 // See if 512-bit ops only use the bottom 128-bits.
42605 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
42606 ExtSizeInBits = SizeInBits / 4;
42607
42608 switch (Opc) {
42609 // Scalar broadcast.
42610 case X86ISD::VBROADCAST: {
42611 SDLoc DL(Op);
42612 SDValue Src = Op.getOperand(0);
42613 if (Src.getValueSizeInBits() > ExtSizeInBits)
42614 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
42615 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42616 ExtSizeInBits / VT.getScalarSizeInBits());
42617 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
42618 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42619 TLO.DAG, DL, ExtSizeInBits));
42620 }
42621 case X86ISD::VBROADCAST_LOAD: {
42622 SDLoc DL(Op);
42623 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42624 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42625 ExtSizeInBits / VT.getScalarSizeInBits());
42626 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
42627 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
42628 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
42629 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
42630 MemIntr->getMemOperand());
42631 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42632 Bcst.getValue(1));
42633 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42634 TLO.DAG, DL, ExtSizeInBits));
42635 }
42636 // Subvector broadcast.
42637 case X86ISD::SUBV_BROADCAST_LOAD: {
42638 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42639 EVT MemVT = MemIntr->getMemoryVT();
42640 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
42641 SDLoc DL(Op);
42642 SDValue Ld =
42643 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
42644 MemIntr->getBasePtr(), MemIntr->getMemOperand());
42645 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42646 Ld.getValue(1));
42647 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
42648 TLO.DAG, DL, ExtSizeInBits));
42649 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
42650 SDLoc DL(Op);
42651 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42652 ExtSizeInBits / VT.getScalarSizeInBits());
42653 if (SDValue BcstLd =
42654 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
42655 return TLO.CombineTo(Op,
42656 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
42657 TLO.DAG, DL, ExtSizeInBits));
42658 }
42659 break;
42660 }
42661 // Byte shifts by immediate.
42662 case X86ISD::VSHLDQ:
42663 case X86ISD::VSRLDQ:
42664 // Shift by uniform.
42665 case X86ISD::VSHL:
42666 case X86ISD::VSRL:
42667 case X86ISD::VSRA:
42668 // Shift by immediate.
42669 case X86ISD::VSHLI:
42670 case X86ISD::VSRLI:
42671 case X86ISD::VSRAI: {
42672 SDLoc DL(Op);
42673 SDValue Ext0 =
42674 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
42675 SDValue ExtOp =
42676 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
42677 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42678 SDValue Insert =
42679 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42680 return TLO.CombineTo(Op, Insert);
42681 }
42682 case X86ISD::VPERMI: {
42683 // Simplify PERMPD/PERMQ to extract_subvector.
42684 // TODO: This should be done in shuffle combining.
42685 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
42686 SmallVector<int, 4> Mask;
42687 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
42688 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
42689 SDLoc DL(Op);
42690 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
42691 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42692 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
42693 return TLO.CombineTo(Op, Insert);
42694 }
42695 }
42696 break;
42697 }
42698 case X86ISD::VPERM2X128: {
42699 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
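// Illustrative example (hypothetical immediate): with (imm & 0xF) == 0x3 the demanded
// low 128-bit lane is the high lane of operand 1, so this becomes an extract of that
// subvector reinserted at index 0; if bit 0x8 is set the demanded lane is zero, so the
// whole node folds to a zero vector.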
42700 SDLoc DL(Op);
42701 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
42702 if (LoMask & 0x8)
42703 return TLO.CombineTo(
42704 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
42705 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
42706 unsigned SrcIdx = (LoMask & 0x2) >> 1;
42707 SDValue ExtOp =
42708 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
42709 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42710 SDValue Insert =
42711 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42712 return TLO.CombineTo(Op, Insert);
42713 }
42714 // Zero upper elements.
42715 case X86ISD::VZEXT_MOVL:
42716 // Target unary shuffles by immediate:
42717 case X86ISD::PSHUFD:
42718 case X86ISD::PSHUFLW:
42719 case X86ISD::PSHUFHW:
42720 case X86ISD::VPERMILPI:
42721 // (Non-Lane Crossing) Target Shuffles.
42722 case X86ISD::VPERMILPV:
42723 case X86ISD::VPERMIL2:
42724 case X86ISD::PSHUFB:
42725 case X86ISD::UNPCKL:
42726 case X86ISD::UNPCKH:
42727 case X86ISD::BLENDI:
42728 // Integer ops.
42729 case X86ISD::PACKSS:
42730 case X86ISD::PACKUS:
42731 // Horizontal Ops.
42732 case X86ISD::HADD:
42733 case X86ISD::HSUB:
42734 case X86ISD::FHADD:
42735 case X86ISD::FHSUB: {
42736 SDLoc DL(Op);
42737 SmallVector<SDValue, 4> Ops;
42738 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42739 SDValue SrcOp = Op.getOperand(i);
42740 EVT SrcVT = SrcOp.getValueType();
42741 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42742 "Unsupported vector size");
42743 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42744 ExtSizeInBits)
42745 : SrcOp);
42746 }
42747 MVT ExtVT = VT.getSimpleVT();
42748 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42749 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42750 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42751 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42752 SDValue Insert =
42753 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42754 return TLO.CombineTo(Op, Insert);
42755 }
42756 }
42757 }
42758
42759 // For splats, unless we *only* demand the 0'th element, stop attempts at
42760 // simplification here: we aren't going to improve things, and this is
42761 // better than any potential shuffle.
42762 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42763 return false;
42764
42765 // Get target/faux shuffle mask.
42766 APInt OpUndef, OpZero;
42767 SmallVector<int, 64> OpMask;
42768 SmallVector<SDValue, 2> OpInputs;
42769 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42770 OpZero, TLO.DAG, Depth, false))
42771 return false;
42772
42773 // Shuffle inputs must be the same size as the result.
42774 if (OpMask.size() != (unsigned)NumElts ||
42775 llvm::any_of(OpInputs, [VT](SDValue V) {
42776 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42777 !V.getValueType().isVector();
42778 }))
42779 return false;
42780
42781 KnownZero = OpZero;
42782 KnownUndef = OpUndef;
42783
42784 // Check if shuffle mask can be simplified to undef/zero/identity.
42785 int NumSrcs = OpInputs.size();
42786 for (int i = 0; i != NumElts; ++i)
42787 if (!DemandedElts[i])
42788 OpMask[i] = SM_SentinelUndef;
42789
42790 if (isUndefInRange(OpMask, 0, NumElts)) {
42791 KnownUndef.setAllBits();
42792 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42793 }
42794 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42795 KnownZero.setAllBits();
42796 return TLO.CombineTo(
42797 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42798 }
42799 for (int Src = 0; Src != NumSrcs; ++Src)
42800 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42801 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42802
42803 // Attempt to simplify inputs.
42804 for (int Src = 0; Src != NumSrcs; ++Src) {
42805 // TODO: Support inputs of different types.
42806 if (OpInputs[Src].getValueType() != VT)
42807 continue;
42808
42809 int Lo = Src * NumElts;
42810 APInt SrcElts = APInt::getZero(NumElts);
42811 for (int i = 0; i != NumElts; ++i)
42812 if (DemandedElts[i]) {
42813 int M = OpMask[i] - Lo;
42814 if (0 <= M && M < NumElts)
42815 SrcElts.setBit(M);
42816 }
42817
42818 // TODO - Propagate input undef/zero elts.
42819 APInt SrcUndef, SrcZero;
42820 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42821 TLO, Depth + 1))
42822 return true;
42823 }
42824
42825 // If we don't demand all elements, then attempt to combine to a simpler
42826 // shuffle.
42827 // We need to convert the depth to something combineX86ShufflesRecursively
42828 // can handle - so pretend its Depth == 0 again, and reduce the max depth
42829 // to match. This prevents combineX86ShuffleChain from returning a
42830 // combined shuffle that's the same as the original root, causing an
42831 // infinite loop.
42832 if (!DemandedElts.isAllOnes()) {
42833 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42834
42835 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42836 for (int i = 0; i != NumElts; ++i)
42837 if (DemandedElts[i])
42838 DemandedMask[i] = i;
42839
42840 SDValue NewShuffle = combineX86ShufflesRecursively(
42841 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42842 /*HasVarMask*/ false,
42843 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42844 Subtarget);
42845 if (NewShuffle)
42846 return TLO.CombineTo(Op, NewShuffle);
42847 }
42848
42849 return false;
42850}
42851
42852bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42853 SDValue Op, const APInt &OriginalDemandedBits,
42854 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42855 unsigned Depth) const {
42856 EVT VT = Op.getValueType();
42857 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42858 unsigned Opc = Op.getOpcode();
42859 switch(Opc) {
42860 case X86ISD::VTRUNC: {
42861 KnownBits KnownOp;
42862 SDValue Src = Op.getOperand(0);
42863 MVT SrcVT = Src.getSimpleValueType();
42864
42865 // Simplify the input, using demanded bit information.
42866 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42867 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42868 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42869 return true;
42870 break;
42871 }
42872 case X86ISD::PMULDQ:
42873 case X86ISD::PMULUDQ: {
42874 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42875 KnownBits KnownLHS, KnownRHS;
42876 SDValue LHS = Op.getOperand(0);
42877 SDValue RHS = Op.getOperand(1);
42878
42879 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42880 // FIXME: Can we bound this better?
42881 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42882 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42883 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42884
42885 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42886 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42887 DemandedMaskLHS = DemandedMask;
42888 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42889 DemandedMaskRHS = DemandedMask;
42890
42891 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42892 KnownLHS, TLO, Depth + 1))
42893 return true;
42894 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42895 KnownRHS, TLO, Depth + 1))
42896 return true;
42897
42898 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42899 KnownRHS = KnownRHS.trunc(32);
42900 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42901 KnownRHS.getConstant().isOne()) {
42902 SDLoc DL(Op);
42903 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42904 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42905 }
42906
42907 // Aggressively peek through ops to get at the demanded low bits.
42908 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42909 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42910 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42911 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42912 if (DemandedLHS || DemandedRHS) {
42913 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42914 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42915 return TLO.CombineTo(
42916 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42917 }
42918 break;
42919 }
42920 case X86ISD::VSHLI: {
42921 SDValue Op0 = Op.getOperand(0);
42922
42923 unsigned ShAmt = Op.getConstantOperandVal(1);
42924 if (ShAmt >= BitWidth)
42925 break;
42926
42927 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42928
42929 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42930 // single shift. We can do this if the bottom bits (which are shifted
42931 // out) are never demanded.
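// Illustrative example (hypothetical amounts): for vshli (vsrli X, 3), 7 where none
// of the low 7 result bits are demanded, Diff = 7 - 3 = 4 and the pair folds to
// vshli X, 4; with equal amounts the two shifts cancel to X.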
42932 if (Op0.getOpcode() == X86ISD::VSRLI &&
42933 OriginalDemandedBits.countr_zero() >= ShAmt) {
42934 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42935 if (Shift2Amt < BitWidth) {
42936 int Diff = ShAmt - Shift2Amt;
42937 if (Diff == 0)
42938 return TLO.CombineTo(Op, Op0.getOperand(0));
42939
42940 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42941 SDValue NewShift = TLO.DAG.getNode(
42942 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42943 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42944 return TLO.CombineTo(Op, NewShift);
42945 }
42946 }
42947
42948 // If we are only demanding sign bits then we can use the shift source directly.
42949 unsigned NumSignBits =
42950 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42951 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42952 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42953 return TLO.CombineTo(Op, Op0);
42954
42955 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42956 TLO, Depth + 1))
42957 return true;
42958
42959 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42960 Known.Zero <<= ShAmt;
42961 Known.One <<= ShAmt;
42962
42963 // Low bits known zero.
42964 Known.Zero.setLowBits(ShAmt);
42965 return false;
42966 }
42967 case X86ISD::VSRLI: {
42968 unsigned ShAmt = Op.getConstantOperandVal(1);
42969 if (ShAmt >= BitWidth)
42970 break;
42971
42972 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42973
42974 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42975 OriginalDemandedElts, Known, TLO, Depth + 1))
42976 return true;
42977
42978 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42979 Known.Zero.lshrInPlace(ShAmt);
42980 Known.One.lshrInPlace(ShAmt);
42981
42982 // High bits known zero.
42983 Known.Zero.setHighBits(ShAmt);
42984 return false;
42985 }
42986 case X86ISD::VSRAI: {
42987 SDValue Op0 = Op.getOperand(0);
42988 SDValue Op1 = Op.getOperand(1);
42989
42990 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
42991 if (ShAmt >= BitWidth)
42992 break;
42993
42994 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42995
42996 // If we just want the sign bit then we don't need to shift it.
42997 if (OriginalDemandedBits.isSignMask())
42998 return TLO.CombineTo(Op, Op0);
42999
43000 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43001 if (Op0.getOpcode() == X86ISD::VSHLI &&
43002 Op.getOperand(1) == Op0.getOperand(1)) {
43003 SDValue Op00 = Op0.getOperand(0);
43004 unsigned NumSignBits =
43005 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43006 if (ShAmt < NumSignBits)
43007 return TLO.CombineTo(Op, Op00);
43008 }
43009
43010 // If any of the demanded bits are produced by the sign extension, we also
43011 // demand the input sign bit.
43012 if (OriginalDemandedBits.countl_zero() < ShAmt)
43013 DemandedMask.setSignBit();
43014
43015 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43016 TLO, Depth + 1))
43017 return true;
43018
43019 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43020 Known.Zero.lshrInPlace(ShAmt);
43021 Known.One.lshrInPlace(ShAmt);
43022
43023 // If the input sign bit is known to be zero, or if none of the top bits
43024 // are demanded, turn this into an unsigned shift right.
43025 if (Known.Zero[BitWidth - ShAmt - 1] ||
43026 OriginalDemandedBits.countl_zero() >= ShAmt)
43027 return TLO.CombineTo(
43028 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43029
43030 // High bits are known one.
43031 if (Known.One[BitWidth - ShAmt - 1])
43032 Known.One.setHighBits(ShAmt);
43033 return false;
43034 }
43035 case X86ISD::BLENDV: {
43036 SDValue Sel = Op.getOperand(0);
43037 SDValue LHS = Op.getOperand(1);
43038 SDValue RHS = Op.getOperand(2);
43039
43040 APInt SignMask = APInt::getSignMask(BitWidth);
43041 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43042 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43043 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43044 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43045 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43046 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43047
43048 if (NewSel || NewLHS || NewRHS) {
43049 NewSel = NewSel ? NewSel : Sel;
43050 NewLHS = NewLHS ? NewLHS : LHS;
43051 NewRHS = NewRHS ? NewRHS : RHS;
43052 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43053 NewSel, NewLHS, NewRHS));
43054 }
43055 break;
43056 }
43057 case X86ISD::PEXTRB:
43058 case X86ISD::PEXTRW: {
43059 SDValue Vec = Op.getOperand(0);
43060 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43061 MVT VecVT = Vec.getSimpleValueType();
43062 unsigned NumVecElts = VecVT.getVectorNumElements();
43063
43064 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43065 unsigned Idx = CIdx->getZExtValue();
43066 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43067
43068 // If we demand no bits from the vector then we must have demanded
43069 // bits from the implicit zext - simplify to zero.
43070 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43071 if (DemandedVecBits == 0)
43072 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43073
43074 APInt KnownUndef, KnownZero;
43075 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43076 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43077 KnownZero, TLO, Depth + 1))
43078 return true;
43079
43080 KnownBits KnownVec;
43081 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43082 KnownVec, TLO, Depth + 1))
43083 return true;
43084
43085 if (SDValue V = SimplifyMultipleUseDemandedBits(
43086 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43087 return TLO.CombineTo(
43088 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43089
43090 Known = KnownVec.zext(BitWidth);
43091 return false;
43092 }
43093 break;
43094 }
43095 case X86ISD::PINSRB:
43096 case X86ISD::PINSRW: {
43097 SDValue Vec = Op.getOperand(0);
43098 SDValue Scl = Op.getOperand(1);
43099 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43100 MVT VecVT = Vec.getSimpleValueType();
43101
43102 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43103 unsigned Idx = CIdx->getZExtValue();
43104 if (!OriginalDemandedElts[Idx])
43105 return TLO.CombineTo(Op, Vec);
43106
43107 KnownBits KnownVec;
43108 APInt DemandedVecElts(OriginalDemandedElts);
43109 DemandedVecElts.clearBit(Idx);
43110 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43111 KnownVec, TLO, Depth + 1))
43112 return true;
43113
43114 KnownBits KnownScl;
43115 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43116 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43117 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43118 return true;
43119
43120 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43121 Known = KnownBits::commonBits(KnownVec, KnownScl);
43122 return false;
43123 }
43124 break;
43125 }
43126 case X86ISD::PACKSS:
43127 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43128 // sign bit then we can just ask for the source operands' sign bit.
43129 // TODO - add known bits handling.
43130 if (OriginalDemandedBits.isSignMask()) {
43131 APInt DemandedLHS, DemandedRHS;
43132 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43133
43134 KnownBits KnownLHS, KnownRHS;
43135 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43136 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43137 KnownLHS, TLO, Depth + 1))
43138 return true;
43139 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43140 KnownRHS, TLO, Depth + 1))
43141 return true;
43142
43143 // Attempt to avoid multi-use ops if we don't need anything from them.
43144 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43145 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43146 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43147 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43148 if (DemandedOp0 || DemandedOp1) {
43149 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43150 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43151 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43152 }
43153 }
43154 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43155 break;
43156 case X86ISD::VBROADCAST: {
43157 SDValue Src = Op.getOperand(0);
43158 MVT SrcVT = Src.getSimpleValueType();
43159 APInt DemandedElts = APInt::getOneBitSet(
43160 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43161 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43162 TLO, Depth + 1))
43163 return true;
43164 // If we don't need the upper bits, attempt to narrow the broadcast source.
43165 // Don't attempt this on AVX512 as it might affect broadcast folding.
43166 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43167 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43168 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43169 Src->hasOneUse()) {
43170 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43171 SDValue NewSrc =
43172 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43173 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43174 SDValue NewBcst =
43175 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43176 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43177 }
43178 break;
43179 }
43180 case X86ISD::PCMPGT:
43181 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43182 // iff we only need the sign bit then we can use R directly.
43183 if (OriginalDemandedBits.isSignMask() &&
43184 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43185 return TLO.CombineTo(Op, Op.getOperand(1));
43186 break;
43187 case X86ISD::MOVMSK: {
43188 SDValue Src = Op.getOperand(0);
43189 MVT SrcVT = Src.getSimpleValueType();
43190 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43191 unsigned NumElts = SrcVT.getVectorNumElements();
43192
43193 // If we don't need the sign bits at all just return zero.
43194 if (OriginalDemandedBits.countr_zero() >= NumElts)
43195 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43196
43197 // See if we only demand bits from the lower 128-bit vector.
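// Illustrative example (hypothetical type): movmsk of a v8i32 source where only
// result bits 0-3 are demanded needs just the low 4 sign bits, so the source is
// narrowed to its low 128-bit half.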
43198 if (SrcVT.is256BitVector() &&
43199 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43200 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43201 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43202 }
43203
43204 // Only demand the vector elements of the sign bits we need.
43205 APInt KnownUndef, KnownZero;
43206 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43207 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43208 TLO, Depth + 1))
43209 return true;
43210
43211 Known.Zero = KnownZero.zext(BitWidth);
43212 Known.Zero.setHighBits(BitWidth - NumElts);
43213
43214 // MOVMSK only uses the MSB from each vector element.
43215 KnownBits KnownSrc;
43216 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43217 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43218 Depth + 1))
43219 return true;
43220
43221 if (KnownSrc.One[SrcBits - 1])
43222 Known.One.setLowBits(NumElts);
43223 else if (KnownSrc.Zero[SrcBits - 1])
43224 Known.Zero.setLowBits(NumElts);
43225
43226 // Attempt to avoid multi-use ops if we don't need anything from them.
43227 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43228 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43229 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43230 return false;
43231 }
43232 case X86ISD::BEXTR:
43233 case X86ISD::BEXTRI: {
43234 SDValue Op0 = Op.getOperand(0);
43235 SDValue Op1 = Op.getOperand(1);
43236
43237 // Only bottom 16-bits of the control bits are required.
43238 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43239 // NOTE: SimplifyDemandedBits won't do this for constants.
43240 uint64_t Val1 = Cst1->getZExtValue();
43241 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43242 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43243 SDLoc DL(Op);
43244 return TLO.CombineTo(
43245 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43246 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43247 }
43248
43249 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43250 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43251
43252 // If the length is 0, the result is 0.
43253 if (Length == 0) {
43254 Known.setAllZero();
43255 return false;
43256 }
43257
43258 if ((Shift + Length) <= BitWidth) {
43259 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43260 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43261 return true;
43262
43263 Known = Known.extractBits(Length, Shift);
43264 Known = Known.zextOrTrunc(BitWidth);
43265 return false;
43266 }
43267 } else {
43268 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43269 KnownBits Known1;
43270 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43271 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43272 return true;
43273
43274 // If the length is 0, replace with 0.
43275 KnownBits LengthBits = Known1.extractBits(8, 8);
43276 if (LengthBits.isZero())
43277 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43278 }
43279
43280 break;
43281 }
43282 case X86ISD::PDEP: {
43283 SDValue Op0 = Op.getOperand(0);
43284 SDValue Op1 = Op.getOperand(1);
43285
43286 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
43287 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43288
43289 // If the demanded bits have leading zeroes, we don't demand those from the
43290 // mask.
43291 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43292 return true;
43293
43294 // The number of possible 1s in the mask determines the number of LSBs of
43295 // operand 0 used. Undemanded bits from the mask don't matter so filter
43296 // them before counting.
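// Illustrative example (hypothetical mask): a mask known to be 0b10100 has two
// possible 1s, so only the low 2 bits of operand 0 can be deposited and are demanded.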
43297 KnownBits Known2;
43298 uint64_t Count = (~Known.Zero & LoMask).popcount();
43299 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43300 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43301 return true;
43302
43303 // Zeroes are retained from the mask, but not ones.
43304 Known.One.clearAllBits();
43305 // The result will have at least as many trailing zeros as the non-mask
43306 // operand since bits can only map to the same or higher bit position.
43307 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43308 return false;
43309 }
43310 }
43311
43312 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43313 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43314}
43315
43316SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43317 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43318 SelectionDAG &DAG, unsigned Depth) const {
43319 int NumElts = DemandedElts.getBitWidth();
43320 unsigned Opc = Op.getOpcode();
43321 EVT VT = Op.getValueType();
43322
43323 switch (Opc) {
43324 case X86ISD::PINSRB:
43325 case X86ISD::PINSRW: {
43326 // If we don't demand the inserted element, return the base vector.
43327 SDValue Vec = Op.getOperand(0);
43328 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43329 MVT VecVT = Vec.getSimpleValueType();
43330 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43331 !DemandedElts[CIdx->getZExtValue()])
43332 return Vec;
43333 break;
43334 }
43335 case X86ISD::VSHLI: {
43336 // If we are only demanding sign bits then we can use the shift source
43337 // directly.
43338 SDValue Op0 = Op.getOperand(0);
43339 unsigned ShAmt = Op.getConstantOperandVal(1);
43340 unsigned BitWidth = DemandedBits.getBitWidth();
43341 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43342 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43343 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43344 return Op0;
43345 break;
43346 }
43347 case X86ISD::VSRAI:
43348 // iff we only need the sign bit then we can use the source directly.
43349 // TODO: generalize where we only demand extended signbits.
43350 if (DemandedBits.isSignMask())
43351 return Op.getOperand(0);
43352 break;
43353 case X86ISD::PCMPGT:
43354 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43355 // iff we only need the sign bit then we can use R directly.
43356 if (DemandedBits.isSignMask() &&
43357 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43358 return Op.getOperand(1);
43359 break;
43360 case X86ISD::ANDNP: {
43361 // ANDNP = (~LHS & RHS);
43362 SDValue LHS = Op.getOperand(0);
43363 SDValue RHS = Op.getOperand(1);
43364
43365 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43366 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43367
43368 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
43369 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
43370 // this context, so return RHS.
43371 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43372 return RHS;
43373 break;
43374 }
43375 }
43376
43377 APInt ShuffleUndef, ShuffleZero;
43378 SmallVector<int, 16> ShuffleMask;
43379 SmallVector<SDValue, 2> ShuffleOps;
43380 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43381 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43382 // If all the demanded elts are from one operand and are inline,
43383 // then we can use the operand directly.
43384 int NumOps = ShuffleOps.size();
43385 if (ShuffleMask.size() == (unsigned)NumElts &&
43386 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43387 return VT.getSizeInBits() == V.getValueSizeInBits();
43388 })) {
43389
43390 if (DemandedElts.isSubsetOf(ShuffleUndef))
43391 return DAG.getUNDEF(VT);
43392 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43393 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43394
43395 // Bitmask that indicates which ops have only been accessed 'inline'.
43396 APInt IdentityOp = APInt::getAllOnes(NumOps);
43397 for (int i = 0; i != NumElts; ++i) {
43398 int M = ShuffleMask[i];
43399 if (!DemandedElts[i] || ShuffleUndef[i])
43400 continue;
43401 int OpIdx = M / NumElts;
43402 int EltIdx = M % NumElts;
43403 if (M < 0 || EltIdx != i) {
43404 IdentityOp.clearAllBits();
43405 break;
43406 }
43407 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43408 if (IdentityOp == 0)
43409 break;
43410 }
43411 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
43412 "Multiple identity shuffles detected");
43413
43414 if (IdentityOp != 0)
43415 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
43416 }
43417 }
43418
43419 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43420 Op, DemandedBits, DemandedElts, DAG, Depth);
43421}
43422
43423bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43424 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43425 bool PoisonOnly, unsigned Depth) const {
43426 unsigned EltsBits = Op.getScalarValueSizeInBits();
43427 unsigned NumElts = DemandedElts.getBitWidth();
43428
43429 // TODO: Add more target shuffles.
43430 switch (Op.getOpcode()) {
43431 case X86ISD::PSHUFD:
43432 case X86ISD::VPERMILPI: {
43433 SmallVector<int, 8> Mask;
43434 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
43435
43436 APInt DemandedSrcElts = APInt::getZero(NumElts);
43437 for (unsigned I = 0; I != NumElts; ++I)
43438 if (DemandedElts[I])
43439 DemandedSrcElts.setBit(Mask[I]);
43440
43441 return DAG.isGuaranteedNotToBeUndefOrPoison(
43442 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
43443 }
43444 }
43445 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43446 Op, DemandedElts, DAG, PoisonOnly, Depth);
43447}
43448
43449bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
43450 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43451 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
43452
43453 // TODO: Add more target shuffles.
43454 switch (Op.getOpcode()) {
43455 case X86ISD::PSHUFD:
43456 case X86ISD::VPERMILPI:
43457 return false;
43458 }
43459 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
43460 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
43461}
43462
43463bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
43464 const APInt &DemandedElts,
43465 APInt &UndefElts,
43466 const SelectionDAG &DAG,
43467 unsigned Depth) const {
43468 unsigned NumElts = DemandedElts.getBitWidth();
43469 unsigned Opc = Op.getOpcode();
43470
43471 switch (Opc) {
43472 case X86ISD::VBROADCAST:
43473 case X86ISD::VBROADCAST_LOAD:
43474 UndefElts = APInt::getZero(NumElts);
43475 return true;
43476 }
43477
43478 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
43479 DAG, Depth);
43480}
43481
43482// Helper to peek through bitops/trunc/setcc to determine size of source vector.
43483// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
43484static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
43485 bool AllowTruncate) {
43486 switch (Src.getOpcode()) {
43487 case ISD::TRUNCATE:
43488 if (!AllowTruncate)
43489 return false;
43490 [[fallthrough]];
43491 case ISD::SETCC:
43492 return Src.getOperand(0).getValueSizeInBits() == Size;
43493 case ISD::AND:
43494 case ISD::XOR:
43495 case ISD::OR:
43496 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
43497 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
43498 case ISD::VSELECT:
43499 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
43500 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
43501 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
43502 case ISD::BUILD_VECTOR:
43503 return ISD::isBuildVectorAllZeros(Src.getNode());
43504
43505 }
43506 return false;
43507}
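// Illustrative example of the size check above: for
//   Src = (v8i1 and (setcc (v8i32 a), (v8i32 b)), (setcc (v8i32 c), (v8i32 d)))
// checkBitcastSrcVectorSize(Src, 256, /*AllowTruncate=*/false) should return
// true, since both setcc operands are 256-bit vectors; with Size == 128 it
// would return false.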
43508
43509// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
43510static unsigned getAltBitOpcode(unsigned Opcode) {
43511 switch(Opcode) {
43512 case ISD::AND: return X86ISD::FAND;
43513 case ISD::OR: return X86ISD::FOR;
43514 case ISD::XOR: return X86ISD::FXOR;
43515 case X86ISD::ANDNP: return X86ISD::FANDN;
43516 }
43517 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43517)
;
43518}
43519
43520// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
43521static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
43522 const SDLoc &DL) {
43523 EVT SrcVT = Src.getValueType();
43524 if (SrcVT != MVT::v4i1)
43525 return SDValue();
43526
43527 switch (Src.getOpcode()) {
43528 case ISD::SETCC:
43529 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
43530 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
43531 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
43532 SDValue Op0 = Src.getOperand(0);
43533 if (ISD::isNormalLoad(Op0.getNode()))
43534 return DAG.getBitcast(MVT::v4f32, Op0);
43535 if (Op0.getOpcode() == ISD::BITCAST &&
43536 Op0.getOperand(0).getValueType() == MVT::v4f32)
43537 return Op0.getOperand(0);
43538 }
43539 break;
43540 case ISD::AND:
43541 case ISD::XOR:
43542 case ISD::OR: {
43543 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
43544 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
43545 if (Op0 && Op1)
43546 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
43547 Op1);
43548 break;
43549 }
43550 }
43551 return SDValue();
43552}
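// Illustrative sketch of the SSE1 adjustment above: a sign-bit compare such as
//   (v4i1 setcc (v4i32 load p), zeroinitializer, setlt)
// is rebuilt as a v4f32 value (a bitcast of the load), so the later MOVMSK of
// v4f32 (MOVMSKPS) can read the sign bits without scalarizing the illegal
// v4i1 compare.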
43553
43554// Helper to push sign extension of vXi1 SETCC result through bitops.
43555static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
43556 SDValue Src, const SDLoc &DL) {
43557 switch (Src.getOpcode()) {
43558 case ISD::SETCC:
43559 case ISD::TRUNCATE:
43560 case ISD::BUILD_VECTOR:
43561 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43562 case ISD::AND:
43563 case ISD::XOR:
43564 case ISD::OR:
43565 return DAG.getNode(
43566 Src.getOpcode(), DL, SExtVT,
43567 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
43568 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
43569 case ISD::VSELECT:
43570 return DAG.getSelect(
43571 DL, SExtVT, Src.getOperand(0),
43572 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
43573 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
43574 }
43575 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 43575)
;
43576}
43577
43578// Try to match patterns such as
43579// (i16 bitcast (v16i1 x))
43580// ->
43581// (i16 movmsk (v16i8 sext (v16i1 x)))
43582// before the illegal vector is scalarized on subtargets that don't have legal
43583// vxi1 types.
43584static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
43585 const SDLoc &DL,
43586 const X86Subtarget &Subtarget) {
43587 EVT SrcVT = Src.getValueType();
43588 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
43589 return SDValue();
43590
43591 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
43592 // legalization destroys the v4i32 type.
43593 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
43594 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
43595 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
43596 DAG.getBitcast(MVT::v4f32, V));
43597 return DAG.getZExtOrTrunc(V, DL, VT);
43598 }
43599 }
43600
43601 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
43602 // movmskb even with avx512. This will be better than truncating to vXi1 and
43603 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
43604 // vpcmpeqb/vpcmpgtb.
43605 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
43606 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
43607 Src.getOperand(0).getValueType() == MVT::v32i8 ||
43608 Src.getOperand(0).getValueType() == MVT::v64i8);
43609
43610 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
43611 // directly with vpmovmskb/vmovmskps/vmovmskpd.
43612 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
43613 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
43614 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
43615 EVT CmpVT = Src.getOperand(0).getValueType();
43616 EVT EltVT = CmpVT.getVectorElementType();
43617 if (CmpVT.getSizeInBits() <= 256 &&
43618 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
43619 PreferMovMsk = true;
43620 }
43621
43622 // With AVX512 vxi1 types are legal and we prefer using k-regs.
43623 // MOVMSK is supported in SSE2 or later.
43624 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
43625 return SDValue();
43626
43627 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
43628 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
43629 // v8i16 and v16i16.
43630 // For these two cases, we can shuffle the upper element bytes to a
43631 // consecutive sequence at the start of the vector and treat the results as
43632 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
43633 // for v16i16 this is not the case, because the shuffle is expensive, so we
43634 // avoid sign-extending to this type entirely.
43635 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43636 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43637 MVT SExtVT;
43638 bool PropagateSExt = false;
43639 switch (SrcVT.getSimpleVT().SimpleTy) {
43640 default:
43641 return SDValue();
43642 case MVT::v2i1:
43643 SExtVT = MVT::v2i64;
43644 break;
43645 case MVT::v4i1:
43646 SExtVT = MVT::v4i32;
43647 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43648 // sign-extend to a 256-bit operation to avoid truncation.
43649 if (Subtarget.hasAVX() &&
43650 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43651 SExtVT = MVT::v4i64;
43652 PropagateSExt = true;
43653 }
43654 break;
43655 case MVT::v8i1:
43656 SExtVT = MVT::v8i16;
43657 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43658 // sign-extend to a 256-bit operation to match the compare.
43659 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43660 // 256-bit because the shuffle is cheaper than sign extending the result of
43661 // the compare.
43662 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43663 checkBitcastSrcVectorSize(Src, 512, true))) {
43664 SExtVT = MVT::v8i32;
43665 PropagateSExt = true;
43666 }
43667 break;
43668 case MVT::v16i1:
43669 SExtVT = MVT::v16i8;
43670 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43671 // it is not profitable to sign-extend to 256-bit because this will
43672 // require an extra cross-lane shuffle which is more expensive than
43673 // truncating the result of the compare to 128-bits.
43674 break;
43675 case MVT::v32i1:
43676 SExtVT = MVT::v32i8;
43677 break;
43678 case MVT::v64i1:
43679    // If we have AVX512F, but not AVX512BW, the input must be a truncate
43680    // from v64i8 (checked earlier). Split the input and make two pmovmskbs.
43681 if (Subtarget.hasAVX512()) {
43682 if (Subtarget.hasBWI())
43683 return SDValue();
43684 SExtVT = MVT::v64i8;
43685 break;
43686 }
43687 // Split if this is a <64 x i8> comparison result.
43688 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43689 SExtVT = MVT::v64i8;
43690 break;
43691 }
43692 return SDValue();
43693 };
43694
43695 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43696 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43697
43698 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43699 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43700 } else {
43701 if (SExtVT == MVT::v8i16)
43702 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
43703 DAG.getUNDEF(MVT::v8i16));
43704 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43705 }
43706
43707 EVT IntVT =
43708 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43709 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43710 return DAG.getBitcast(VT, V);
43711}
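// Illustrative result of the combine above, assuming a plain SSE2 target:
//   (i16 bitcast (v16i1 setcc (v16i8 a), (v16i8 b), eq))
// should become roughly
//   (i16 trunc (i32 movmsk (v16i8 sext (v16i1 setcc ...))))
// i.e. a PCMPEQB feeding PMOVMSKB, instead of scalarizing 16 i1 elements.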
43712
43713// Convert a vXi1 constant build vector to the same width scalar integer.
43714static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43715 EVT SrcVT = Op.getValueType();
43716  assert(SrcVT.getVectorElementType() == MVT::i1 &&
43717         "Expected a vXi1 vector");
43718  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
43719         "Expected a constant build vector");
43720
43721 APInt Imm(SrcVT.getVectorNumElements(), 0);
43722 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43723 SDValue In = Op.getOperand(Idx);
43724 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
43725 Imm.setBit(Idx);
43726 }
43727 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43728 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43729}
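// Worked example for the conversion above: a constant (v8i1 <1,0,1,1,0,0,0,0>)
// has bits 0, 2 and 3 set, so it becomes the scalar constant (i8 0x0D).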
43730
43731static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43732 TargetLowering::DAGCombinerInfo &DCI,
43733 const X86Subtarget &Subtarget) {
43734  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43735
43736 if (!DCI.isBeforeLegalizeOps())
43737 return SDValue();
43738
43739 // Only do this if we have k-registers.
43740 if (!Subtarget.hasAVX512())
43741 return SDValue();
43742
43743 EVT DstVT = N->getValueType(0);
43744 SDValue Op = N->getOperand(0);
43745 EVT SrcVT = Op.getValueType();
43746
43747 if (!Op.hasOneUse())
43748 return SDValue();
43749
43750 // Look for logic ops.
43751 if (Op.getOpcode() != ISD::AND &&
43752 Op.getOpcode() != ISD::OR &&
43753 Op.getOpcode() != ISD::XOR)
43754 return SDValue();
43755
43756 // Make sure we have a bitcast between mask registers and a scalar type.
43757 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43758 DstVT.isScalarInteger()) &&
43759 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43760 SrcVT.isScalarInteger()))
43761 return SDValue();
43762
43763 SDValue LHS = Op.getOperand(0);
43764 SDValue RHS = Op.getOperand(1);
43765
43766 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43767 LHS.getOperand(0).getValueType() == DstVT)
43768 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43769 DAG.getBitcast(DstVT, RHS));
43770
43771 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43772 RHS.getOperand(0).getValueType() == DstVT)
43773 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43774 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43775
43776 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43777 // Most of these have to move a constant from the scalar domain anyway.
43778 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43779 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43780 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43781 DAG.getBitcast(DstVT, LHS), RHS);
43782 }
43783
43784 return SDValue();
43785}
43786
43787static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43788 const X86Subtarget &Subtarget) {
43789 SDLoc DL(BV);
43790 unsigned NumElts = BV->getNumOperands();
43791 SDValue Splat = BV->getSplatValue();
43792
43793 // Build MMX element from integer GPR or SSE float values.
43794 auto CreateMMXElement = [&](SDValue V) {
43795 if (V.isUndef())
43796 return DAG.getUNDEF(MVT::x86mmx);
43797 if (V.getValueType().isFloatingPoint()) {
43798 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43799 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43800 V = DAG.getBitcast(MVT::v2i64, V);
43801 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43802 }
43803 V = DAG.getBitcast(MVT::i32, V);
43804 } else {
43805 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43806 }
43807 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43808 };
43809
43810 // Convert build vector ops to MMX data in the bottom elements.
43811 SmallVector<SDValue, 8> Ops;
43812
43813 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43814
43815 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43816 if (Splat) {
43817 if (Splat.isUndef())
43818 return DAG.getUNDEF(MVT::x86mmx);
43819
43820 Splat = CreateMMXElement(Splat);
43821
43822 if (Subtarget.hasSSE1()) {
43823 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43824 if (NumElts == 8)
43825 Splat = DAG.getNode(
43826 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43827 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43828 TLI.getPointerTy(DAG.getDataLayout())),
43829 Splat, Splat);
43830
43831 // Use PSHUFW to repeat 16-bit elements.
43832 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43833 return DAG.getNode(
43834 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43835 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43836 TLI.getPointerTy(DAG.getDataLayout())),
43837 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43838 }
43839 Ops.append(NumElts, Splat);
43840 } else {
43841 for (unsigned i = 0; i != NumElts; ++i)
43842 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43843 }
43844
43845 // Use tree of PUNPCKLs to build up general MMX vector.
43846 while (Ops.size() > 1) {
43847 unsigned NumOps = Ops.size();
43848 unsigned IntrinOp =
43849 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43850 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43851 : Intrinsic::x86_mmx_punpcklbw));
43852 SDValue Intrin = DAG.getTargetConstant(
43853 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43854 for (unsigned i = 0; i != NumOps; i += 2)
43855 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43856 Ops[i], Ops[i + 1]);
43857 Ops.resize(NumOps / 2);
43858 }
43859
43860 return Ops[0];
43861}
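// Illustrative shape of the PUNPCKL tree built above for a non-splat v8i8
// build vector: eight single-element MMX values are merged pairwise with
// punpcklbw (8 -> 4), then punpcklwd (4 -> 2), then punpckldq (2 -> 1),
// leaving the complete 64-bit MMX value in Ops[0].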
43862
43863// Recursive function that attempts to find if a bool vector node was originally
43864// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43865// integer. If so, replace the scalar ops with bool vector equivalents back down
43866// the chain.
43867static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43868 SelectionDAG &DAG,
43869 const X86Subtarget &Subtarget) {
43870 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43871 unsigned Opc = V.getOpcode();
43872 switch (Opc) {
43873 case ISD::BITCAST: {
43874 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43875 SDValue Src = V.getOperand(0);
43876 EVT SrcVT = Src.getValueType();
43877 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43878 return DAG.getBitcast(VT, Src);
43879 break;
43880 }
43881 case ISD::TRUNCATE: {
43882 // If we find a suitable source, a truncated scalar becomes a subvector.
43883 SDValue Src = V.getOperand(0);
43884 EVT NewSrcVT =
43885 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43886 if (TLI.isTypeLegal(NewSrcVT))
43887 if (SDValue N0 =
43888 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43889 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43890 DAG.getIntPtrConstant(0, DL));
43891 break;
43892 }
43893 case ISD::ANY_EXTEND:
43894 case ISD::ZERO_EXTEND: {
43895 // If we find a suitable source, an extended scalar becomes a subvector.
43896 SDValue Src = V.getOperand(0);
43897 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43898 Src.getScalarValueSizeInBits());
43899 if (TLI.isTypeLegal(NewSrcVT))
43900 if (SDValue N0 =
43901 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43902 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43903 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43904 : DAG.getConstant(0, DL, VT),
43905 N0, DAG.getIntPtrConstant(0, DL));
43906 break;
43907 }
43908 case ISD::OR: {
43909 // If we find suitable sources, we can just move an OR to the vector domain.
43910 SDValue Src0 = V.getOperand(0);
43911 SDValue Src1 = V.getOperand(1);
43912 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43913 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
43914 return DAG.getNode(Opc, DL, VT, N0, N1);
43915 break;
43916 }
43917 case ISD::SHL: {
43918 // If we find a suitable source, a SHL becomes a KSHIFTL.
43919 SDValue Src0 = V.getOperand(0);
43920 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43921 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43922 break;
43923
43924 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43925 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43926 return DAG.getNode(
43927 X86ISD::KSHIFTL, DL, VT, N0,
43928 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43929 break;
43930 }
43931 }
43932 return SDValue();
43933}
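// Illustrative example of the recursion above, assuming v8i1/v16i1 are legal
// (AVX512): the scalar chain
//   (v16i1 bitcast (i16 shl (i16 zext (i8 bitcast (v8i1 X))), 4))
// should become
//   (v16i1 kshiftl (insert_subvector zeroinitializer, (v8i1 X), 0), 4)
// keeping the mask in a k-register instead of bouncing through a GPR.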
43934
43935static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43936 TargetLowering::DAGCombinerInfo &DCI,
43937 const X86Subtarget &Subtarget) {
43938 SDValue N0 = N->getOperand(0);
43939 EVT VT = N->getValueType(0);
43940 EVT SrcVT = N0.getValueType();
43941 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43942
43943 // Try to match patterns such as
43944 // (i16 bitcast (v16i1 x))
43945 // ->
43946  // (i16 movmsk (v16i8 sext (v16i1 x)))
43947 // before the setcc result is scalarized on subtargets that don't have legal
43948 // vxi1 types.
43949 if (DCI.isBeforeLegalize()) {
43950 SDLoc dl(N);
43951 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43952 return V;
43953
43954 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43955 // type, widen both sides to avoid a trip through memory.
43956 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43957 Subtarget.hasAVX512()) {
43958 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43959 N0 = DAG.getBitcast(MVT::v8i1, N0);
43960 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43961 DAG.getIntPtrConstant(0, dl));
43962 }
43963
43964 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43965 // type, widen both sides to avoid a trip through memory.
43966 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43967 Subtarget.hasAVX512()) {
43968 // Use zeros for the widening if we already have some zeroes. This can
43969      // allow SimplifyDemandedBits to remove scalar ANDs that may be
43970      // downstream of this.
43971 // FIXME: It might make sense to detect a concat_vectors with a mix of
43972 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43973 // a separate combine. What we can't do is canonicalize the operands of
43974 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43975 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43976 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43977 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43978 SrcVT = LastOp.getValueType();
43979 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43980 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43981 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43982 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43983 N0 = DAG.getBitcast(MVT::i8, N0);
43984 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43985 }
43986 }
43987
43988 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43989 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43990 Ops[0] = N0;
43991 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43992 N0 = DAG.getBitcast(MVT::i8, N0);
43993 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43994 }
43995 } else {
43996 // If we're bitcasting from iX to vXi1, see if the integer originally
43997 // began as a vXi1 and whether we can remove the bitcast entirely.
43998 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43999 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44000 if (SDValue V =
44001 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44002 return V;
44003 }
44004 }
44005
44006 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44007 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44008 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44009 // we can help with known bits propagation from the vXi1 domain to the
44010 // scalar domain.
44011 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44012 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44013 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44014 isNullConstant(N0.getOperand(1)))
44015 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44016 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44017
44018 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44019 // and the vbroadcast_load are both integer or both fp. In some cases this
44020 // will remove the bitcast entirely.
44021 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44022 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44023 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44024 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44025 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44026    // Don't swap i8/i16 since we don't have fp types of that size.
44027 if (MemSize >= 32) {
44028 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44029 : MVT::getIntegerVT(MemSize);
44030 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44031 : MVT::getIntegerVT(SrcVTSize);
44032 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44033
44034 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44035 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44036 SDValue ResNode =
44037 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44038 MemVT, BCast->getMemOperand());
44039 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44040 return DAG.getBitcast(VT, ResNode);
44041 }
44042 }
44043
44044 // Since MMX types are special and don't usually play with other vector types,
44045 // it's better to handle them early to be sure we emit efficient code by
44046 // avoiding store-load conversions.
44047 if (VT == MVT::x86mmx) {
44048 // Detect MMX constant vectors.
44049 APInt UndefElts;
44050 SmallVector<APInt, 1> EltBits;
44051 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44052 SDLoc DL(N0);
44053 // Handle zero-extension of i32 with MOVD.
44054 if (EltBits[0].countl_zero() >= 32)
44055 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44056 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44057 // Else, bitcast to a double.
44058 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44059 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44060 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44061 }
44062
44063 // Detect bitcasts to x86mmx low word.
44064 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44065 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44066 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44067 bool LowUndef = true, AllUndefOrZero = true;
44068 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44069 SDValue Op = N0.getOperand(i);
44070 LowUndef &= Op.isUndef() || (i >= e/2);
44071 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44072 }
44073 if (AllUndefOrZero) {
44074 SDValue N00 = N0.getOperand(0);
44075 SDLoc dl(N00);
44076 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44077 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44078 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44079 }
44080 }
44081
44082 // Detect bitcasts of 64-bit build vectors and convert to a
44083 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44084 // lowest element.
44085 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44086 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44087 SrcVT == MVT::v8i8))
44088 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44089
44090 // Detect bitcasts between element or subvector extraction to x86mmx.
44091 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44092 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44093 isNullConstant(N0.getOperand(1))) {
44094 SDValue N00 = N0.getOperand(0);
44095 if (N00.getValueType().is128BitVector())
44096 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44097 DAG.getBitcast(MVT::v2i64, N00));
44098 }
44099
44100 // Detect bitcasts from FP_TO_SINT to x86mmx.
44101 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44102 SDLoc DL(N0);
44103 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44104 DAG.getUNDEF(MVT::v2i32));
44105 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44106 DAG.getBitcast(MVT::v2i64, Res));
44107 }
44108 }
44109
44110 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44111 // most of these to scalar anyway.
44112 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44113 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44114 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44115 return combinevXi1ConstantToInteger(N0, DAG);
44116 }
44117
44118 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44119 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44120 isa<ConstantSDNode>(N0)) {
44121 auto *C = cast<ConstantSDNode>(N0);
44122 if (C->isAllOnes())
44123 return DAG.getConstant(1, SDLoc(N0), VT);
44124 if (C->isZero())
44125 return DAG.getConstant(0, SDLoc(N0), VT);
44126 }
44127
44128 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44129 // Turn it into a sign bit compare that produces a k-register. This avoids
44130 // a trip through a GPR.
44131 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44132 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44133 isPowerOf2_32(VT.getVectorNumElements())) {
44134 unsigned NumElts = VT.getVectorNumElements();
44135 SDValue Src = N0;
44136
44137 // Peek through truncate.
44138 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44139 Src = N0.getOperand(0);
44140
44141 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44142 SDValue MovmskIn = Src.getOperand(0);
44143 MVT MovmskVT = MovmskIn.getSimpleValueType();
44144 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44145
44146 // We allow extra bits of the movmsk to be used since they are known zero.
44147 // We can't convert a VPMOVMSKB without avx512bw.
44148 if (MovMskElts <= NumElts &&
44149 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44150 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44151 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44152 SDLoc dl(N);
44153 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44154 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44155 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44156 if (EVT(CmpVT) == VT)
44157 return Cmp;
44158
44159 // Pad with zeroes up to original VT to replace the zeroes that were
44160 // being used from the MOVMSK.
44161 unsigned NumConcats = NumElts / MovMskElts;
44162 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44163 Ops[0] = Cmp;
44164 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44165 }
44166 }
44167 }
44168
44169 // Try to remove bitcasts from input and output of mask arithmetic to
44170 // remove GPR<->K-register crossings.
44171 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44172 return V;
44173
44174 // Convert a bitcasted integer logic operation that has one bitcasted
44175 // floating-point operand into a floating-point logic operation. This may
44176 // create a load of a constant, but that is cheaper than materializing the
44177 // constant in an integer register and transferring it to an SSE register or
44178 // transferring the SSE operand to integer register and back.
44179 unsigned FPOpcode;
44180 switch (N0.getOpcode()) {
44181 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44182 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44183 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44184 default: return SDValue();
44185 }
44186
44187 // Check if we have a bitcast from another integer type as well.
44188 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
44189 (Subtarget.hasSSE2() && VT == MVT::f64) ||
44190 (Subtarget.hasFP16() && VT == MVT::f16) ||
44191 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
44192 TLI.isTypeLegal(VT))))
44193 return SDValue();
44194
44195 SDValue LogicOp0 = N0.getOperand(0);
44196 SDValue LogicOp1 = N0.getOperand(1);
44197 SDLoc DL0(N0);
44198
44199 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44200 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
44201 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
44202 LogicOp0.getOperand(0).getValueType() == VT &&
44203 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
44204 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
44205 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44206 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44207 }
44208 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44209 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44210 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44211 LogicOp1.getOperand(0).getValueType() == VT &&
44212 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44213 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44214 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44215 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44216 }
44217
44218 return SDValue();
44219}
44220
44221// (mul (zext a), (sext b))
44222static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44223 SDValue &Op1) {
44224 Op0 = Mul.getOperand(0);
44225 Op1 = Mul.getOperand(1);
44226
44227  // Operand 1 should be the sign-extended value.
44228 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44229 std::swap(Op0, Op1);
44230
44231 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44232 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44233 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44234 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44235 return true;
44236
44237 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44238 return (BV && BV->isConstant());
44239 };
44240
44241  // (dpbusd (zext a), (sext b)). Since the first operand must be an unsigned
44242  // value, we check that Op0 is a zero-extended value. Op1 must be a signed
44243  // value, so we just check its number of significant (sign) bits.
44244 if ((IsFreeTruncation(Op0) &&
44245 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44246 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44247 return true;
44248
44249 return false;
44250}
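// Illustrative match for detectExtMul: with
//   Op0 = (v16i32 zero_extend (v16i8 a)), Op1 = (v16i32 sign_extend (v16i8 b))
// both truncations back to i8 are free and the known-bits checks pass, so the
// multiply is a candidate for the unsigned*signed VPDPBUSD form.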
44251
44252// Given a ABS node, detect the following pattern:
44253// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44254// This is useful as it is the input into a SAD pattern.
44255static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44256 SDValue AbsOp1 = Abs->getOperand(0);
44257 if (AbsOp1.getOpcode() != ISD::SUB)
44258 return false;
44259
44260 Op0 = AbsOp1.getOperand(0);
44261 Op1 = AbsOp1.getOperand(1);
44262
44263 // Check if the operands of the sub are zero-extended from vectors of i8.
44264 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44265 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44266 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44267 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44268 return false;
44269
44270 return true;
44271}
44272
44273static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44274 unsigned &LogBias, const SDLoc &DL,
44275 const X86Subtarget &Subtarget) {
44276 // Extend or truncate to MVT::i8 first.
44277 MVT Vi8VT =
44278 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44279 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44280 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44281
44282  // VPDPBUSD(<16 x i32> C, <16 x i8> A, <16 x i8> B). For each dst element:
44283  // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3].
44284  // The src A, B element type is i8, but the dst C element type is i32.
44285  // When we calculate the reduction stages we use the vXi8 src vector type,
44286  // so we need a log-bias of 2 to avoid two extra stages.
44287 LogBias = 2;
44288
44289 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44290 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44291 RegSize = std::max(512u, RegSize);
44292
44293 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44294 // fill in the missing vector elements with 0.
44295 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44296 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44297 Ops[0] = LHS;
44298 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44299 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44300 Ops[0] = RHS;
44301 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44302
44303 // Actually build the DotProduct, split as 256/512 bits for
44304 // AVXVNNI/AVX512VNNI.
44305 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44306 ArrayRef<SDValue> Ops) {
44307 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44308 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44309 };
44310 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44311 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44312
44313 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44314 DpBuilder, false);
44315}
44316
44317// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44318// to these zexts.
44319static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44320 const SDValue &Zext1, const SDLoc &DL,
44321 const X86Subtarget &Subtarget) {
44322 // Find the appropriate width for the PSADBW.
44323 EVT InVT = Zext0.getOperand(0).getValueType();
44324 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44325
44326 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44327 // fill in the missing vector elements with 0.
44328 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44329 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44330 Ops[0] = Zext0.getOperand(0);
44331 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44332 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44333 Ops[0] = Zext1.getOperand(0);
44334 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44335
44336 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44337 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44338 ArrayRef<SDValue> Ops) {
44339 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44340 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44341 };
44342 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44343 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44344 PSADBWBuilder);
44345}
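// Illustrative sketch: for zexts with v8i8 sources on an SSE2 target, RegSize
// is 128, so each source is widened to v16i8 by concatenating a zero vector,
// and a single (v2i64 psadbw v16i8, v16i8) produces the partial sums; the
// zero padding contributes nothing to the absolute differences.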
44346
44347// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44348// PHMINPOSUW.
44349static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44350 const X86Subtarget &Subtarget) {
44351 // Bail without SSE41.
44352 if (!Subtarget.hasSSE41())
44353 return SDValue();
44354
44355 EVT ExtractVT = Extract->getValueType(0);
44356 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44357 return SDValue();
44358
44359 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44360 ISD::NodeType BinOp;
44361 SDValue Src = DAG.matchBinOpReduction(
44362 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44363 if (!Src)
44364 return SDValue();
44365
44366 EVT SrcVT = Src.getValueType();
44367 EVT SrcSVT = SrcVT.getScalarType();
44368 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44369 return SDValue();
44370
44371 SDLoc DL(Extract);
44372 SDValue MinPos = Src;
44373
44374 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44375 while (SrcVT.getSizeInBits() > 128) {
44376 SDValue Lo, Hi;
44377 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44378 SrcVT = Lo.getValueType();
44379 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44380 }
44381  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44382          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44383         "Unexpected value type");
44384
44385 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
44386 // to flip the value accordingly.
44387 SDValue Mask;
44388 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44389 if (BinOp == ISD::SMAX)
44390 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44391 else if (BinOp == ISD::SMIN)
44392 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44393 else if (BinOp == ISD::UMAX)
44394 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44395
44396 if (Mask)
44397 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44398
44399 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44400 // shuffling each upper element down and insert zeros. This means that the
44401 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
44402 // ready for the PHMINPOS.
44403 if (ExtractVT == MVT::i8) {
44404 SDValue Upper = DAG.getVectorShuffle(
44405 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44406 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44407 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44408 }
44409
44410  // Perform the PHMINPOS on a v8i16 vector.
44411 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44412 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44413 MinPos = DAG.getBitcast(SrcVT, MinPos);
44414
44415 if (Mask)
44416 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44417
44418 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
44419 DAG.getIntPtrConstant(0, DL));
44420}
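// Worked example of the mask trick above: PHMINPOSUW only computes an unsigned
// v8i16 minimum, so an SMAX reduction first XORs every element with 0x7FFF.
// That maps the largest signed value (0x7FFF) to 0 and the smallest (0x8000)
// to 0xFFFF, so the unsigned minimum selects the signed maximum; XORing with
// 0x7FFF again recovers the original element value.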
44421
44422// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44423static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
44424 const X86Subtarget &Subtarget) {
44425 // Bail without SSE2.
44426 if (!Subtarget.hasSSE2())
44427 return SDValue();
44428
44429 EVT ExtractVT = Extract->getValueType(0);
44430 unsigned BitWidth = ExtractVT.getSizeInBits();
44431 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
44432 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
44433 return SDValue();
44434
44435 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
44436 ISD::NodeType BinOp;
44437 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
44438 if (!Match && ExtractVT == MVT::i1)
44439 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
44440 if (!Match)
44441 return SDValue();
44442
44443 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
44444 // which we can't support here for now.
44445 if (Match.getScalarValueSizeInBits() != BitWidth)
44446 return SDValue();
44447
44448 SDValue Movmsk;
44449 SDLoc DL(Extract);
44450 EVT MatchVT = Match.getValueType();
44451 unsigned NumElts = MatchVT.getVectorNumElements();
44452 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
44453 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44454 LLVMContext &Ctx = *DAG.getContext();
44455
44456 if (ExtractVT == MVT::i1) {
44457 // Special case for (pre-legalization) vXi1 reductions.
44458 if (NumElts > 64 || !isPowerOf2_32(NumElts))
44459 return SDValue();
44460 if (TLI.isTypeLegal(MatchVT)) {
44461 // If this is a legal AVX512 predicate type then we can just bitcast.
44462 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44463 Movmsk = DAG.getBitcast(MovmskVT, Match);
44464 } else {
44465 // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
44466 // For any_of(setcc(x,y,ne)) - use PMOVMSKB(NOT(PCMPEQB())).
44467 if (Match.getOpcode() == ISD::SETCC) {
44468 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
44469 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
44470 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
44471 EVT VecVT = Match.getOperand(0).getValueType();
44472
44473 // If representable as a scalar integer:
44474 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
44475 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
44476 EVT IntVT = EVT::getIntegerVT(Ctx, VecVT.getSizeInBits());
44477 if (TLI.isTypeLegal(IntVT)) {
44478 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
44479 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
44480 return DAG.getSetCC(DL, ExtractVT, DAG.getBitcast(IntVT, LHS),
44481 DAG.getBitcast(IntVT, RHS), CC);
44482 }
44483
44484 EVT VecSVT = VecVT.getScalarType();
44485 if (VecSVT != MVT::i8 && (VecSVT.getSizeInBits() % 8) == 0) {
44486 NumElts *= VecSVT.getSizeInBits() / 8;
44487 EVT CmpVT = EVT::getVectorVT(Ctx, MVT::i8, NumElts);
44488 MatchVT = EVT::getVectorVT(Ctx, MVT::i1, NumElts);
44489 Match = DAG.getSetCC(
44490 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
44491 DAG.getBitcast(CmpVT, Match.getOperand(1)), CC);
44492 }
44493 }
44494 }
44495
44496 // Use combineBitcastvxi1 to create the MOVMSK.
44497 while (NumElts > MaxElts) {
44498 SDValue Lo, Hi;
44499 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44500 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44501 NumElts /= 2;
44502 }
44503 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44504 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
44505 }
44506 if (!Movmsk)
44507 return SDValue();
44508 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
44509 } else {
44510 // FIXME: Better handling of k-registers or 512-bit vectors?
44511 unsigned MatchSizeInBits = Match.getValueSizeInBits();
44512 if (!(MatchSizeInBits == 128 ||
44513 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
44514 return SDValue();
44515
44516 // Make sure this isn't a vector of 1 element. The perf win from using
44517    // MOVMSK diminishes with fewer elements in the reduction, but it is
44518 // generally better to get the comparison over to the GPRs as soon as
44519 // possible to reduce the number of vector ops.
44520 if (Match.getValueType().getVectorNumElements() < 2)
44521 return SDValue();
44522
44523 // Check that we are extracting a reduction of all sign bits.
44524 if (DAG.ComputeNumSignBits(Match) != BitWidth)
44525 return SDValue();
44526
44527 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
44528 SDValue Lo, Hi;
44529 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44530 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44531 MatchSizeInBits = Match.getValueSizeInBits();
44532 }
44533
44534 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
44535 MVT MaskSrcVT;
44536 if (64 == BitWidth || 32 == BitWidth)
44537 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
44538 MatchSizeInBits / BitWidth);
44539 else
44540 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
44541
44542 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
44543 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
44544 NumElts = MaskSrcVT.getVectorNumElements();
44545 }
44546  assert((NumElts <= 32 || NumElts == 64) &&
44547         "Not expecting more than 64 elements");
44548
44549 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
44550 if (BinOp == ISD::XOR) {
44551 // parity -> (PARITY(MOVMSK X))
44552 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
44553 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
44554 }
44555
44556 SDValue CmpC;
44557 ISD::CondCode CondCode;
44558 if (BinOp == ISD::OR) {
44559 // any_of -> MOVMSK != 0
44560 CmpC = DAG.getConstant(0, DL, CmpVT);
44561 CondCode = ISD::CondCode::SETNE;
44562 } else {
44563 // all_of -> MOVMSK == ((1 << NumElts) - 1)
44564 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
44565 DL, CmpVT);
44566 CondCode = ISD::CondCode::SETEQ;
44567 }
44568
44569 // The setcc produces an i8 of 0/1, so extend that to the result width and
44570 // negate to get the final 0/-1 mask value.
44571 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
44572 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
44573 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
44574 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
44575 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
44576}
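// Illustrative results of the reduction combine above for four 32-bit compare
// elements: an OR (any_of) reduction becomes (MOVMSKPS != 0), an AND (all_of)
// reduction becomes (MOVMSKPS == 0xF), and an i1 XOR reduction (parity)
// becomes PARITY(MOVMSK).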
44577
44578static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
44579 const X86Subtarget &Subtarget) {
44580 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
44581 return SDValue();
44582
44583 EVT ExtractVT = Extract->getValueType(0);
44584 // Verify the type we're extracting is i32, as the output element type of
44585 // vpdpbusd is i32.
44586 if (ExtractVT != MVT::i32)
44587 return SDValue();
44588
44589 EVT VT = Extract->getOperand(0).getValueType();
44590 if (!isPowerOf2_32(VT.getVectorNumElements()))
44591 return SDValue();
44592
44593 // Match shuffle + add pyramid.
44594 ISD::NodeType BinOp;
44595 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44596
44597 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
44598  // done by vpdpbusd computes a signed 16-bit product that will be sign-extended
44599 // before adding into the accumulator.
44600 // TODO:
44601 // We also need to verify that the multiply has at least 2x the number of bits
44602 // of the input. We shouldn't match
44603 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
44604 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
44605 // Root = Root.getOperand(0);
44606
44607 // If there was a match, we want Root to be a mul.
44608 if (!Root || Root.getOpcode() != ISD::MUL)
44609 return SDValue();
44610
44611 // Check whether we have an extend and mul pattern
44612 SDValue LHS, RHS;
44613 if (!detectExtMul(DAG, Root, LHS, RHS))
44614 return SDValue();
44615
44616 // Create the dot product instruction.
44617 SDLoc DL(Extract);
44618 unsigned StageBias;
44619 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
44620
44621 // If the original vector was wider than 4 elements, sum over the results
44622 // in the DP vector.
44623 unsigned Stages = Log2_32(VT.getVectorNumElements());
44624 EVT DpVT = DP.getValueType();
44625
44626 if (Stages > StageBias) {
44627 unsigned DpElems = DpVT.getVectorNumElements();
44628
44629 for (unsigned i = Stages - StageBias; i > 0; --i) {
44630 SmallVector<int, 16> Mask(DpElems, -1);
44631 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44632 Mask[j] = MaskEnd + j;
44633
44634 SDValue Shuffle =
44635 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
44636 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
44637 }
44638 }
44639
44640 // Return the lowest ExtractSizeInBits bits.
44641 EVT ResVT =
44642 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44643 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44644 DP = DAG.getBitcast(ResVT, DP);
44645 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44646 Extract->getOperand(1));
44647}
44648
44649static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44650 const X86Subtarget &Subtarget) {
44651 // PSADBW is only supported on SSE2 and up.
44652 if (!Subtarget.hasSSE2())
44653 return SDValue();
44654
44655 EVT ExtractVT = Extract->getValueType(0);
44656 // Verify the type we're extracting is either i32 or i64.
44657 // FIXME: Could support other types, but this is what we have coverage for.
44658 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44659 return SDValue();
44660
44661 EVT VT = Extract->getOperand(0).getValueType();
44662 if (!isPowerOf2_32(VT.getVectorNumElements()))
44663 return SDValue();
44664
44665 // Match shuffle + add pyramid.
44666 ISD::NodeType BinOp;
44667 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44668
44669  // The operand is expected to be zero-extended from i8
44670  // (verified in detectZextAbsDiff).
44671  // To convert to i64 and above, an additional any/zero/sign
44672  // extend is expected.
44673  // The zero extend from 32 bits has no mathematical effect on the result.
44674  // The sign extend is also effectively a zero extend
44675  // (it extends the sign bit, which is zero).
44676  // So it is correct to skip the sign/zero-extend instruction.
44677 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44678 Root.getOpcode() == ISD::ZERO_EXTEND ||
44679 Root.getOpcode() == ISD::ANY_EXTEND))
44680 Root = Root.getOperand(0);
44681
44682 // If there was a match, we want Root to be a select that is the root of an
44683 // abs-diff pattern.
44684 if (!Root || Root.getOpcode() != ISD::ABS)
44685 return SDValue();
44686
44687 // Check whether we have an abs-diff pattern feeding into the select.
44688 SDValue Zext0, Zext1;
44689 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44690 return SDValue();
44691
44692 // Create the SAD instruction.
44693 SDLoc DL(Extract);
44694 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44695
44696 // If the original vector was wider than 8 elements, sum over the results
44697 // in the SAD vector.
44698 unsigned Stages = Log2_32(VT.getVectorNumElements());
44699 EVT SadVT = SAD.getValueType();
44700 if (Stages > 3) {
44701 unsigned SadElems = SadVT.getVectorNumElements();
44702
44703 for(unsigned i = Stages - 3; i > 0; --i) {
44704 SmallVector<int, 16> Mask(SadElems, -1);
44705 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44706 Mask[j] = MaskEnd + j;
44707
44708 SDValue Shuffle =
44709 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44710 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44711 }
44712 }
44713
44714 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44715 // Return the lowest ExtractSizeInBits bits.
44716 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44717 SadVT.getSizeInBits() / ExtractSizeInBits);
44718 SAD = DAG.getBitcast(ResVT, SAD);
44719 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44720 Extract->getOperand(1));
44721}
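// Illustrative walk-through: for a sum reduction of
//   (abs (sub (zext (v16i8 a)), (zext (v16i8 b))))
// createPSADBW yields a v2i64 with two partial sums. Stages == 4, so a single
// shuffle+add folds element 1 into element 0, and the final extract of
// element 0 returns the full sum of absolute differences.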
44722
44723// Attempt to peek through a target shuffle and extract the scalar from the
44724// source.
44725static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44726 TargetLowering::DAGCombinerInfo &DCI,
44727 const X86Subtarget &Subtarget) {
44728 if (DCI.isBeforeLegalizeOps())
44729 return SDValue();
44730
44731 SDLoc dl(N);
44732 SDValue Src = N->getOperand(0);
44733 SDValue Idx = N->getOperand(1);
44734
44735 EVT VT = N->getValueType(0);
44736 EVT SrcVT = Src.getValueType();
44737 EVT SrcSVT = SrcVT.getVectorElementType();
44738 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44739 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44740
44741 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44742 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44743 return SDValue();
44744
44745 const APInt &IdxC = N->getConstantOperandAPInt(1);
44746 if (IdxC.uge(NumSrcElts))
44747 return SDValue();
44748
44749 SDValue SrcBC = peekThroughBitcasts(Src);
44750
44751 // Handle extract(bitcast(broadcast(scalar_value))).
44752 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44753 SDValue SrcOp = SrcBC.getOperand(0);
44754 EVT SrcOpVT = SrcOp.getValueType();
44755 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44756 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44757 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44758 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44759 // TODO support non-zero offsets.
44760 if (Offset == 0) {
44761 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44762 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44763 return SrcOp;
44764 }
44765 }
44766 }
44767
44768 // If we're extracting a single element from a broadcast load and there are
44769 // no other users, just create a single load.
44770 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44771 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44772 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44773 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44774 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44775 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44776 MemIntr->getBasePtr(),
44777 MemIntr->getPointerInfo(),
44778 MemIntr->getOriginalAlign(),
44779 MemIntr->getMemOperand()->getFlags());
44780 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44781 return Load;
44782 }
44783 }
44784
44785 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44786 // TODO: Move to DAGCombine?
44787 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44788 SrcBC.getValueType().isInteger() &&
44789 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44790 SrcBC.getScalarValueSizeInBits() ==
44791 SrcBC.getOperand(0).getValueSizeInBits()) {
44792 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44793 if (IdxC.ult(Scale)) {
44794 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44795 SDValue Scl = SrcBC.getOperand(0);
44796 EVT SclVT = Scl.getValueType();
44797 if (Offset) {
44798 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44799 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44800 }
44801 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44802 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44803 return Scl;
44804 }
44805 }
44806
44807 // Handle extract(truncate(x)) for 0'th index.
44808 // TODO: Treat this as a faux shuffle?
44809 // TODO: When can we use this for general indices?
44810 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44811 (SrcVT.getSizeInBits() % 128) == 0) {
44812 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44813 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44814 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44815 Idx);
44816 }
44817
44818  // We can only legally extract other elements from 128-bit vectors, and only
44819  // in certain circumstances, depending on the SSE level.
44820 // TODO: Investigate float/double extraction if it will be just stored.
44821 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44822 unsigned Idx) {
44823 EVT VecSVT = VecVT.getScalarType();
44824 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44825 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44826 VecSVT == MVT::i64)) {
44827 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44828 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44829 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44830 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44831 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44832 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44833 Idx &= (NumEltsPerLane - 1);
44834 }
44835 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44836 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44837 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44838 DAG.getBitcast(VecVT, Vec),
44839 DAG.getIntPtrConstant(Idx, dl));
44840 }
44841 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44842 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44843 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44844 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44845 DAG.getTargetConstant(Idx, dl, MVT::i8));
44846 }
44847 return SDValue();
44848 };
44849
44850 // Resolve the target shuffle inputs and mask.
44851 SmallVector<int, 16> Mask;
44852 SmallVector<SDValue, 2> Ops;
44853 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44854 return SDValue();
44855
44856 // Shuffle inputs must be the same size as the result.
44857 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44858 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44859 }))
44860 return SDValue();
44861
44862 // Attempt to narrow/widen the shuffle mask to the correct size.
44863 if (Mask.size() != NumSrcElts) {
44864 if ((NumSrcElts % Mask.size()) == 0) {
44865 SmallVector<int, 16> ScaledMask;
44866 int Scale = NumSrcElts / Mask.size();
44867 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44868 Mask = std::move(ScaledMask);
44869 } else if ((Mask.size() % NumSrcElts) == 0) {
44870      // Simplify Mask based on the demanded element.
44871 int ExtractIdx = (int)IdxC.getZExtValue();
44872 int Scale = Mask.size() / NumSrcElts;
44873 int Lo = Scale * ExtractIdx;
44874 int Hi = Scale * (ExtractIdx + 1);
44875 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44876 if (i < Lo || Hi <= i)
44877 Mask[i] = SM_SentinelUndef;
44878
44879 SmallVector<int, 16> WidenedMask;
44880 while (Mask.size() > NumSrcElts &&
44881 canWidenShuffleElements(Mask, WidenedMask))
44882 Mask = std::move(WidenedMask);
44883 }
44884 }
44885
44886 // If narrowing/widening failed, see if we can extract+zero-extend.
44887 int ExtractIdx;
44888 EVT ExtractVT;
44889 if (Mask.size() == NumSrcElts) {
44890 ExtractIdx = Mask[IdxC.getZExtValue()];
44891 ExtractVT = SrcVT;
44892 } else {
44893 unsigned Scale = Mask.size() / NumSrcElts;
44894 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44895 return SDValue();
44896 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44897 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44898 return SDValue();
44899 ExtractIdx = Mask[ScaledIdx];
44900 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44901 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44902    assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44903           "Failed to widen vector type");
44904 }
44905
44906 // If the shuffle source element is undef/zero then we can just accept it.
44907 if (ExtractIdx == SM_SentinelUndef)
44908 return DAG.getUNDEF(VT);
44909
44910 if (ExtractIdx == SM_SentinelZero)
44911 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44912 : DAG.getConstant(0, dl, VT);
44913
44914 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44915 ExtractIdx = ExtractIdx % Mask.size();
44916 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44917 return DAG.getZExtOrTrunc(V, dl, VT);
44918
44919 return SDValue();
44920}
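// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): a scalar
// model of resolving an extract index through a shuffle mask, as done above.
// The sentinel values mirror the intent of SM_SentinelUndef/SM_SentinelZero,
// but the names and types here are local to this sketch.
#include <vector>

enum : int { SketchUndef = -1, SketchZero = -2 };

struct ResolvedExtract {
  int SourceOp;  // which shuffle input (0 or 1), or -1 for undef/zero results
  int SourceElt; // element index inside that input, or the sentinel itself
};

static ResolvedExtract resolveExtract(const std::vector<int> &Mask,
                                      unsigned ExtractIdx) {
  int M = Mask[ExtractIdx];
  if (M == SketchUndef || M == SketchZero)
    return {-1, M};                  // extract of an undef/zero lane folds away
  int NumElts = static_cast<int>(Mask.size());
  return {M / NumElts, M % NumElts}; // peek through to the shuffle input lane
}
// ---------------------------------------------------------------------------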
44921
44922/// Extracting a scalar FP value from vector element 0 is free, so extract each
44923/// operand first, then perform the math as a scalar op.
44924static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44925 const X86Subtarget &Subtarget) {
44926  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44927 SDValue Vec = ExtElt->getOperand(0);
44928 SDValue Index = ExtElt->getOperand(1);
44929 EVT VT = ExtElt->getValueType(0);
44930 EVT VecVT = Vec.getValueType();
44931
44932 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44933 // non-zero element because the shuffle+scalar op will be cheaper?
44934 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44935 return SDValue();
44936
44937 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44938 // extract, the condition code), so deal with those as a special-case.
44939 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44940 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44941 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44942 return SDValue();
44943
44944 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44945 SDLoc DL(ExtElt);
44946 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44947 Vec.getOperand(0), Index);
44948 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44949 Vec.getOperand(1), Index);
44950 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44951 }
44952
44953 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44954 VT != MVT::f64)
44955 return SDValue();
44956
44957 // Vector FP selects don't fit the pattern of FP math ops (because the
44958 // condition has a different type and we have to change the opcode), so deal
44959 // with those here.
44960 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44961 // has i1 elements. If we loosen this we need to convert vector bool to a
44962 // scalar bool.
44963 if (Vec.getOpcode() == ISD::VSELECT &&
44964 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44965 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44966 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44967 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44968 SDLoc DL(ExtElt);
44969 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44970 Vec.getOperand(0).getValueType().getScalarType(),
44971 Vec.getOperand(0), Index);
44972 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44973 Vec.getOperand(1), Index);
44974 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44975 Vec.getOperand(2), Index);
44976 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44977 }
44978
44979 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44980 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44981 // missed load folding and fma+fneg combining.
44982 switch (Vec.getOpcode()) {
44983 case ISD::FMA: // Begin 3 operands
44984 case ISD::FMAD:
44985 case ISD::FADD: // Begin 2 operands
44986 case ISD::FSUB:
44987 case ISD::FMUL:
44988 case ISD::FDIV:
44989 case ISD::FREM:
44990 case ISD::FCOPYSIGN:
44991 case ISD::FMINNUM:
44992 case ISD::FMAXNUM:
44993 case ISD::FMINNUM_IEEE:
44994 case ISD::FMAXNUM_IEEE:
44995 case ISD::FMAXIMUM:
44996 case ISD::FMINIMUM:
44997 case X86ISD::FMAX:
44998 case X86ISD::FMIN:
44999 case ISD::FABS: // Begin 1 operand
45000 case ISD::FSQRT:
45001 case ISD::FRINT:
45002 case ISD::FCEIL:
45003 case ISD::FTRUNC:
45004 case ISD::FNEARBYINT:
45005 case ISD::FROUND:
45006 case ISD::FFLOOR:
45007 case X86ISD::FRCP:
45008 case X86ISD::FRSQRT: {
45009 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45010 SDLoc DL(ExtElt);
45011 SmallVector<SDValue, 4> ExtOps;
45012 for (SDValue Op : Vec->ops())
45013 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45014 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45015 }
45016 default:
45017 return SDValue();
45018 }
45019  llvm_unreachable("All opcodes should return within switch");
45020}
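// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): the
// identity scalarizeExtEltFP relies on -- extracting lane 0 of a lane-wise FP
// op equals applying the scalar op to lane 0 of each operand. Fixed-size
// float arrays stand in for vector SDValues here.
#include <array>

using Vec4f = std::array<float, 4>;

static float extractThenAdd(const Vec4f &X, const Vec4f &Y) {
  return X[0] + Y[0];          // scalar op on the already-extracted elements
}

static float addThenExtract(const Vec4f &X, const Vec4f &Y) {
  Vec4f Sum{};                 // lane-wise vector op ...
  for (int i = 0; i != 4; ++i)
    Sum[i] = X[i] + Y[i];
  return Sum[0];               // ... followed by the lane-0 extract
}
// For any inputs the two functions return the same value; the combine prefers
// the first form because lane-0 FP extracts are free on x86.
// ---------------------------------------------------------------------------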
45021
45022/// Try to convert a vector reduction sequence composed of binops and shuffles
45023/// into horizontal ops.
45024static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45025 const X86Subtarget &Subtarget) {
45026  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45027
45028  // We need at least SSE2 to do anything here.
45029 if (!Subtarget.hasSSE2())
45030 return SDValue();
45031
45032 ISD::NodeType Opc;
45033 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45034 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45035 if (!Rdx)
45036 return SDValue();
45037
45038 SDValue Index = ExtElt->getOperand(1);
45039  assert(isNullConstant(Index) &&
45040         "Reduction doesn't end in an extract from index 0");
45041
45042 EVT VT = ExtElt->getValueType(0);
45043 EVT VecVT = Rdx.getValueType();
45044 if (VecVT.getScalarType() != VT)
45045 return SDValue();
45046
45047 SDLoc DL(ExtElt);
45048 unsigned NumElts = VecVT.getVectorNumElements();
45049 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45050
45051 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45052 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45053 if (V.getValueType() == MVT::v4i8) {
45054 if (ZeroExtend && Subtarget.hasSSE41()) {
45055 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45056 DAG.getConstant(0, DL, MVT::v4i32),
45057 DAG.getBitcast(MVT::i32, V),
45058 DAG.getIntPtrConstant(0, DL));
45059 return DAG.getBitcast(MVT::v16i8, V);
45060 }
45061 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45062 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45063 : DAG.getUNDEF(MVT::v4i8));
45064 }
45065 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45066 DAG.getUNDEF(MVT::v8i8));
45067 };
45068
45069 // vXi8 mul reduction - promote to vXi16 mul reduction.
45070 if (Opc == ISD::MUL) {
45071 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45072 return SDValue();
45073 if (VecVT.getSizeInBits() >= 128) {
45074 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45075 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45076 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45077 Lo = DAG.getBitcast(WideVT, Lo);
45078 Hi = DAG.getBitcast(WideVT, Hi);
45079 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45080 while (Rdx.getValueSizeInBits() > 128) {
45081 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45082 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45083 }
45084 } else {
45085 Rdx = WidenToV16I8(Rdx, false);
45086 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45087 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45088 }
45089 if (NumElts >= 8)
45090 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45091 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45092 {4, 5, 6, 7, -1, -1, -1, -1}));
45093 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45094 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45095 {2, 3, -1, -1, -1, -1, -1, -1}));
45096 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45097 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45098 {1, -1, -1, -1, -1, -1, -1, -1}));
45099 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45100 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45101 }
45102
45103  // vXi8 add reduction - sub-128-bit vector.
45104 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45105 Rdx = WidenToV16I8(Rdx, true);
45106 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45107 DAG.getConstant(0, DL, MVT::v16i8));
45108 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45109 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45110 }
45111
45112 // Must be a >=128-bit vector with pow2 elements.
45113 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45114 return SDValue();
45115
45116 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45117 if (VT == MVT::i8) {
45118 while (Rdx.getValueSizeInBits() > 128) {
45119 SDValue Lo, Hi;
45120 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45121 VecVT = Lo.getValueType();
45122 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45123 }
45124    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45125
45126 SDValue Hi = DAG.getVectorShuffle(
45127 MVT::v16i8, DL, Rdx, Rdx,
45128 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45129 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45130 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45131 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45132 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45133 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45134 }
45135
45136 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45137 // If the source vector values are 0-255, then we can use PSADBW to
45138 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45139  // TODO: See if it's worth avoiding vXi16/i32 truncations?
45140 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45141 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45142 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45143 Subtarget.hasAVX512())) {
45144 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45145 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45146 if (ByteVT.getSizeInBits() < 128)
45147 Rdx = WidenToV16I8(Rdx, true);
45148
45149 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45150 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45151 ArrayRef<SDValue> Ops) {
45152 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45153 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45154 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45155 };
45156 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45157 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45158
45159 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45160 while (Rdx.getValueSizeInBits() > 128) {
45161 SDValue Lo, Hi;
45162 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45163 VecVT = Lo.getValueType();
45164 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45165 }
45166    assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45167
45168 if (NumElts > 8) {
45169 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45170 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45171 }
45172
45173 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45174 Rdx = DAG.getBitcast(VecVT, Rdx);
45175 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45176 }
45177
45178  // Only use (F)HADD opcodes if they aren't microcoded or if we're minimizing codesize.
45179 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45180 return SDValue();
45181
45182 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45183
45184 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45185 // across the whole vector, so we need an extract + hop preliminary stage.
45186 // This is the only step where the operands of the hop are not the same value.
45187 // TODO: We could extend this to handle 512-bit or even longer vectors.
45188 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
45189 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
45190 unsigned NumElts = VecVT.getVectorNumElements();
45191 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
45192 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
45193 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
45194 VecVT = Rdx.getValueType();
45195 }
45196 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
45197 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
45198 return SDValue();
45199
45200 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45201 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
45202 for (unsigned i = 0; i != ReductionSteps; ++i)
45203 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
45204
45205 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45206}
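// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): a model
// of the HADD-based tail of combineArithReduction. Applying hadd(V, V)
// log2(N) times leaves the sum of all N elements in lane 0, which is what the
// final extract at index 0 reads. haddModel/haddReduce are sketch-only names.
#include <cstddef>
#include <cstdint>
#include <vector>

// Same-operand horizontal add: adjacent pairs are summed and the results are
// duplicated into both halves (the effect of PHADDD/PHADDW on hadd(V, V)).
static std::vector<int32_t> haddModel(const std::vector<int32_t> &V) {
  std::vector<int32_t> R(V.size());
  std::size_t Half = V.size() / 2;
  for (std::size_t i = 0; i < Half; ++i)
    R[i] = R[i + Half] = V[2 * i] + V[2 * i + 1];
  return R;
}

static int32_t haddReduce(std::vector<int32_t> V) {
  for (std::size_t Step = 1; Step < V.size(); Step *= 2)
    V = haddModel(V);          // one hop per reduction step
  return V[0];                 // the total of the original elements
}
// ---------------------------------------------------------------------------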
45207
45208/// Detect vector gather/scatter index generation and convert it from being a
45209/// bunch of shuffles and extracts into a somewhat faster sequence.
45210/// For i686, the best sequence is apparently storing the value and loading
45211/// scalars back, while for x64 we should use 64-bit extracts and shifts.
45212static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
45213 TargetLowering::DAGCombinerInfo &DCI,
45214 const X86Subtarget &Subtarget) {
45215 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
45216 return NewOp;
45217
45218 SDValue InputVector = N->getOperand(0);
45219 SDValue EltIdx = N->getOperand(1);
45220 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
45221
45222 EVT SrcVT = InputVector.getValueType();
45223 EVT VT = N->getValueType(0);
45224 SDLoc dl(InputVector);
45225 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45226 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45227 unsigned NumEltBits = VT.getScalarSizeInBits();
45228 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45229
45230 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45231 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45232
45233 // Integer Constant Folding.
45234 if (CIdx && VT.isInteger()) {
45235 APInt UndefVecElts;
45236 SmallVector<APInt, 16> EltBits;
45237 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45238 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45239 EltBits, true, false)) {
45240 uint64_t Idx = CIdx->getZExtValue();
45241 if (UndefVecElts[Idx])
45242 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45243 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45244 }
45245
45246    // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45247    // Improves lowering of bool masks in Rust, which splits them into a byte array.
45248 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45249 SDValue Src = peekThroughBitcasts(InputVector);
45250 if (Src.getValueType().getScalarType() == MVT::i1 &&
45251 TLI.isTypeLegal(Src.getValueType())) {
45252 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45253 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45254 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45255 return DAG.getBitcast(VT, Sub);
45256 }
45257 }
45258 }
45259
45260 if (IsPextr) {
45261 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45262 DCI))
45263 return SDValue(N, 0);
45264
45265 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45266 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45267 InputVector.getOpcode() == X86ISD::PINSRW) &&
45268 InputVector.getOperand(2) == EltIdx) {
45269      assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45270             "Vector type mismatch");
45271 SDValue Scl = InputVector.getOperand(1);
45272 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45273 return DAG.getZExtOrTrunc(Scl, dl, VT);
45274 }
45275
45276 // TODO - Remove this once we can handle the implicit zero-extension of
45277 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45278 // combineBasicSADPattern.
45279 return SDValue();
45280 }
45281
45282 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
45283 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
45284 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
45285 SDValue MMXSrc = InputVector.getOperand(0);
45286
45287 // The bitcast source is a direct mmx result.
45288 if (MMXSrc.getValueType() == MVT::x86mmx)
45289 return DAG.getBitcast(VT, InputVector);
45290 }
45291
45292 // Detect mmx to i32 conversion through a v2i32 elt extract.
45293 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
45294 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
45295 SDValue MMXSrc = InputVector.getOperand(0);
45296
45297 // The bitcast source is a direct mmx result.
45298 if (MMXSrc.getValueType() == MVT::x86mmx)
45299 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
45300 }
45301
45302 // Check whether this extract is the root of a sum of absolute differences
45303 // pattern. This has to be done here because we really want it to happen
45304  // pre-legalization.
45305 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45306 return SAD;
45307
45308 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45309 return VPDPBUSD;
45310
45311 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45312 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45313 return Cmp;
45314
45315 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45316 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45317 return MinMax;
45318
45319 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45320 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45321 return V;
45322
45323 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45324 return V;
45325
45326  // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
45327 // and then testing the relevant element.
45328 //
45329 // Note that we only combine extracts on the *same* result number, i.e.
45330 // t0 = merge_values a0, a1, a2, a3
45331 // i1 = extract_vector_elt t0, Constant:i64<2>
45332 // i1 = extract_vector_elt t0, Constant:i64<3>
45333 // but not
45334 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45335 // since the latter would need its own MOVMSK.
45336 if (SrcVT.getScalarType() == MVT::i1) {
45337 bool IsVar = !CIdx;
45338 SmallVector<SDNode *, 16> BoolExtracts;
45339 unsigned ResNo = InputVector.getResNo();
45340 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45341 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45342 Use->getOperand(0).getResNo() == ResNo &&
45343 Use->getValueType(0) == MVT::i1) {
45344 BoolExtracts.push_back(Use);
45345 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45346 return true;
45347 }
45348 return false;
45349 };
45350 // TODO: Can we drop the oneuse check for constant extracts?
45351 if (all_of(InputVector->uses(), IsBoolExtract) &&
45352 (IsVar || BoolExtracts.size() > 1)) {
45353 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45354 if (SDValue BC =
45355 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45356 for (SDNode *Use : BoolExtracts) {
45357 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45358 // Mask = 1 << MaskIdx
45359 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45360 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45361 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45362 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45363 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45364 DCI.CombineTo(Use, Res);
45365 }
45366 return SDValue(N, 0);
45367 }
45368 }
45369 }
45370
45371 // If this extract is from a loaded vector value and will be used as an
45372 // integer, that requires a potentially expensive XMM -> GPR transfer.
45373 // Additionally, if we can convert to a scalar integer load, that will likely
45374 // be folded into a subsequent integer op.
45375 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
45376 // to a single-use of the loaded vector. For the reasons above, we
45377 // expect this to be profitable even if it creates an extra load.
45378 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
45379 return Use->getOpcode() == ISD::STORE ||
45380 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45381 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45382 });
45383 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
45384 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45385 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
45386 !LikelyUsedAsVector && LoadVec->isSimple()) {
45387 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45388 SDValue NewPtr =
45389 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
45390 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
45391 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45392 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45393 SDValue Load =
45394 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45395 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45396 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45397 return Load;
45398 }
45399
45400 return SDValue();
45401}
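// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): the bit
// test that each rewritten i1 extract performs once the mask vector has been
// compressed into a scalar bitmask via a MOVMSK-style lowering.
#include <cstdint>

static bool extractBoolViaBitmask(uint32_t MovmskBits, unsigned Idx) {
  uint32_t Mask = 1u << Idx;            // Mask = 1 << MaskIdx
  return (MovmskBits & Mask) == Mask;   // ((movmsk X) & Mask) == Mask
}
// ---------------------------------------------------------------------------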
45402
45403// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45404// This is more or less the reverse of combineBitcastvxi1.
45405static SDValue combineToExtendBoolVectorInReg(
45406 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45407 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45408 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45409 Opcode != ISD::ANY_EXTEND)
45410 return SDValue();
45411 if (!DCI.isBeforeLegalizeOps())
45412 return SDValue();
45413 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45414 return SDValue();
45415
45416 EVT SVT = VT.getScalarType();
45417 EVT InSVT = N0.getValueType().getScalarType();
45418 unsigned EltSizeInBits = SVT.getSizeInBits();
45419
45420 // Input type must be extending a bool vector (bit-casted from a scalar
45421 // integer) to legal integer types.
45422 if (!VT.isVector())
45423 return SDValue();
45424 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45425 return SDValue();
45426 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45427 return SDValue();
45428
45429 SDValue N00 = N0.getOperand(0);
45430 EVT SclVT = N00.getValueType();
45431 if (!SclVT.isScalarInteger())
45432 return SDValue();
45433
45434 SDValue Vec;
45435 SmallVector<int> ShuffleMask;
45436 unsigned NumElts = VT.getVectorNumElements();
45437  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45438
45439 // Broadcast the scalar integer to the vector elements.
45440 if (NumElts > EltSizeInBits) {
45441 // If the scalar integer is greater than the vector element size, then we
45442 // must split it down into sub-sections for broadcasting. For example:
45443 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45444 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45445    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45446 unsigned Scale = NumElts / EltSizeInBits;
45447 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45448 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45449 Vec = DAG.getBitcast(VT, Vec);
45450
45451 for (unsigned i = 0; i != Scale; ++i)
45452 ShuffleMask.append(EltSizeInBits, i);
45453 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45454 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45455 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45456 // If we have register broadcast instructions, use the scalar size as the
45457 // element type for the shuffle. Then cast to the wider element type. The
45458 // widened bits won't be used, and this might allow the use of a broadcast
45459 // load.
45460    assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45461 unsigned Scale = EltSizeInBits / NumElts;
45462 EVT BroadcastVT =
45463 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45464 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45465 ShuffleMask.append(NumElts * Scale, 0);
45466 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45467 Vec = DAG.getBitcast(VT, Vec);
45468 } else {
45469    // For a smaller scalar integer, we can simply any-extend it to the vector
45470    // element size (we don't care about the upper bits) and broadcast it to all
45471    // elements.
45472 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
45473 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
45474 ShuffleMask.append(NumElts, 0);
45475 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45476 }
45477
45478 // Now, mask the relevant bit in each element.
45479 SmallVector<SDValue, 32> Bits;
45480 for (unsigned i = 0; i != NumElts; ++i) {
45481 int BitIdx = (i % EltSizeInBits);
45482 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
45483 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
45484 }
45485 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
45486 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
45487
45488 // Compare against the bitmask and extend the result.
45489 EVT CCVT = VT.changeVectorElementType(MVT::i1);
45490 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
45491 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
45492
45493 // For SEXT, this is now done, otherwise shift the result down for
45494 // zero-extension.
45495 if (Opcode == ISD::SIGN_EXTEND)
45496 return Vec;
45497 return DAG.getNode(ISD::SRL, DL, VT, Vec,
45498 DAG.getConstant(EltSizeInBits - 1, DL, VT));
45499}
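// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): a scalar
// model of the broadcast+mask+compare sequence built above, for the simple
// case where each vector element owns one bit of the scalar (NumElts <= 32 is
// assumed here). Element i of the result is the sign- or zero-extension of
// bit i of the scalar integer.
#include <cstdint>
#include <vector>

static std::vector<int32_t> extendBoolBits(uint32_t Scl, unsigned NumElts,
                                           bool SignExtend) {
  std::vector<int32_t> Result(NumElts);
  for (unsigned i = 0; i != NumElts; ++i) {
    uint32_t Bit = 1u << i;                 // per-element bit mask
    bool Set = (Scl & Bit) == Bit;          // the SETEQ against the bitmask
    Result[i] = SignExtend ? (Set ? -1 : 0) // SIGN_EXTEND: all-ones or zero
                           : (Set ? 1 : 0); // ZERO_EXTEND: shifted down to 1
  }
  return Result;
}
// ---------------------------------------------------------------------------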
45500
45501/// If a vector select has an operand that is -1 or 0, try to simplify the
45502/// select to a bitwise logic operation.
45503/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
45504static SDValue
45505combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
45506 TargetLowering::DAGCombinerInfo &DCI,
45507 const X86Subtarget &Subtarget) {
45508 SDValue Cond = N->getOperand(0);
45509 SDValue LHS = N->getOperand(1);
45510 SDValue RHS = N->getOperand(2);
45511 EVT VT = LHS.getValueType();
45512 EVT CondVT = Cond.getValueType();
45513 SDLoc DL(N);
45514 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45515
45516 if (N->getOpcode() != ISD::VSELECT)
45517 return SDValue();
45518
45519  assert(CondVT.isVector() && "Vector select expects a vector selector!");
45520
45521 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
45522 // TODO: Can we assert that both operands are not zeros (because that should
45523 // get simplified at node creation time)?
45524 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
45525 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
45526
45527 // If both inputs are 0/undef, create a complete zero vector.
45528 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
45529 if (TValIsAllZeros && FValIsAllZeros) {
45530 if (VT.isFloatingPoint())
45531 return DAG.getConstantFP(0.0, DL, VT);
45532 return DAG.getConstant(0, DL, VT);
45533 }
45534
45535 // To use the condition operand as a bitwise mask, it must have elements that
45536  // are the same size as the select elements. I.e., the condition operand must
45537 // have already been promoted from the IR select condition type <N x i1>.
45538 // Don't check if the types themselves are equal because that excludes
45539 // vector floating-point selects.
45540 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
45541 return SDValue();
45542
45543 // Try to invert the condition if true value is not all 1s and false value is
45544 // not all 0s. Only do this if the condition has one use.
45545 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
45546 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
45547 // Check if the selector will be produced by CMPP*/PCMP*.
45548 Cond.getOpcode() == ISD::SETCC &&
45549 // Check if SETCC has already been promoted.
45550 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
45551 CondVT) {
45552 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
45553
45554 if (TValIsAllZeros || FValIsAllOnes) {
45555 SDValue CC = Cond.getOperand(2);
45556 ISD::CondCode NewCC = ISD::getSetCCInverse(
45557 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
45558 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
45559 NewCC);
45560 std::swap(LHS, RHS);
45561 TValIsAllOnes = FValIsAllOnes;
45562 FValIsAllZeros = TValIsAllZeros;
45563 }
45564 }
45565
45566 // Cond value must be 'sign splat' to be converted to a logical op.
45567 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
45568 return SDValue();
45569
45570 // vselect Cond, 111..., 000... -> Cond
45571 if (TValIsAllOnes && FValIsAllZeros)
45572 return DAG.getBitcast(VT, Cond);
45573
45574 if (!TLI.isTypeLegal(CondVT))
45575 return SDValue();
45576
45577 // vselect Cond, 111..., X -> or Cond, X
45578 if (TValIsAllOnes) {
45579 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45580 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
45581 return DAG.getBitcast(VT, Or);
45582 }
45583
45584 // vselect Cond, X, 000... -> and Cond, X
45585 if (FValIsAllZeros) {
45586 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
45587 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
45588 return DAG.getBitcast(VT, And);
45589 }
45590
45591 // vselect Cond, 000..., X -> andn Cond, X
45592 if (TValIsAllZeros) {
45593 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45594 SDValue AndN;
45595    // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
45596 if (CondVT.getScalarType() == MVT::i1)
45597 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
45598 CastRHS);
45599 else
45600 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45601 return DAG.getBitcast(VT, AndN);
45602 }
45603
45604 return SDValue();
45605}
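// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): the
// mask-based rewrites above, expressed on a per-element integer mask that is
// either all-ones or all-zeros (the "sign splat" precondition the combine
// checks first).
#include <cstdint>

static uint32_t vselectModel(uint32_t Cond, uint32_t X, uint32_t Y) {
  return (Cond & X) | (~Cond & Y);        // reference select semantics
}
// vselect Cond, 111..., X  -->  or Cond, X
static uint32_t allOnesTrueArm(uint32_t Cond, uint32_t X) { return Cond | X; }
// vselect Cond, X, 000...  -->  and Cond, X
static uint32_t allZerosFalseArm(uint32_t Cond, uint32_t X) { return Cond & X; }
// vselect Cond, 000..., X  -->  andn Cond, X  (ANDNP computes ~Cond & X)
static uint32_t allZerosTrueArm(uint32_t Cond, uint32_t X) { return ~Cond & X; }
// ---------------------------------------------------------------------------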
45606
45607/// If both arms of a vector select are concatenated vectors, split the select,
45608/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45609/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45610/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45611static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
45612 const X86Subtarget &Subtarget) {
45613 unsigned Opcode = N->getOpcode();
45614 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45615 return SDValue();
45616
45617 // TODO: Split 512-bit vectors too?
45618 EVT VT = N->getValueType(0);
45619 if (!VT.is256BitVector())
45620 return SDValue();
45621
45622 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45623 SDValue Cond = N->getOperand(0);
45624 SDValue TVal = N->getOperand(1);
45625 SDValue FVal = N->getOperand(2);
45626 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
45627 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45628 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
45629 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
45630 return SDValue();
45631
45632 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45633 ArrayRef<SDValue> Ops) {
45634 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45635 };
45636 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45637 makeBlend, /*CheckBWI*/ false);
45638}
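// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): the
// split performed by narrowVectorSelect, modelled on plain arrays. A blend of
// two concatenations is rewritten as the concatenation of two half-width
// blends, one per 128-bit half.
#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t N>
static std::array<int32_t, N> blendModel(const std::array<int32_t, N> &Cond,
                                         const std::array<int32_t, N> &T,
                                         const std::array<int32_t, N> &F) {
  std::array<int32_t, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = Cond[i] ? T[i] : F[i];
  return R;
}

// vselect Cond, concat(T0, T1), concat(F0, F1)
//   --> concat(vselect(Cond.lo, T0, F0), vselect(Cond.hi, T1, F1))
static std::array<int32_t, 8> splitBlend(const std::array<int32_t, 8> &Cond,
                                         const std::array<int32_t, 4> &T0,
                                         const std::array<int32_t, 4> &T1,
                                         const std::array<int32_t, 4> &F0,
                                         const std::array<int32_t, 4> &F1) {
  std::array<int32_t, 4> CondLo{}, CondHi{};
  for (int i = 0; i != 4; ++i) {
    CondLo[i] = Cond[i];
    CondHi[i] = Cond[i + 4];
  }
  std::array<int32_t, 4> Lo = blendModel<4>(CondLo, T0, F0);
  std::array<int32_t, 4> Hi = blendModel<4>(CondHi, T1, F1);
  std::array<int32_t, 8> R{};
  for (int i = 0; i != 4; ++i) {
    R[i] = Lo[i];
    R[i + 4] = Hi[i];
  }
  return R;
}
// ---------------------------------------------------------------------------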
45639
45640static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45641 SDValue Cond = N->getOperand(0);
45642 SDValue LHS = N->getOperand(1);
45643 SDValue RHS = N->getOperand(2);
45644 SDLoc DL(N);
45645
45646 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45647 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45648 if (!TrueC || !FalseC)
45649 return SDValue();
45650
45651 // Don't do this for crazy integer types.
45652 EVT VT = N->getValueType(0);
45653 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45654 return SDValue();
45655
45656 // We're going to use the condition bit in math or logic ops. We could allow
45657 // this with a wider condition value (post-legalization it becomes an i8),
45658 // but if nothing is creating selects that late, it doesn't matter.
45659 if (Cond.getValueType() != MVT::i1)
45660 return SDValue();
45661
45662 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45663 // 3, 5, or 9 with i32/i64, so those get transformed too.
45664 // TODO: For constants that overflow or do not differ by power-of-2 or small
45665 // multiplier, convert to 'and' + 'add'.
45666 const APInt &TrueVal = TrueC->getAPIntValue();
45667 const APInt &FalseVal = FalseC->getAPIntValue();
45668
45669 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45670 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45671 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45672 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45673 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45674 return SDValue();
45675 }
45676
45677 bool OV;
45678 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45679 if (OV)
45680 return SDValue();
45681
45682 APInt AbsDiff = Diff.abs();
45683 if (AbsDiff.isPowerOf2() ||
45684 ((VT == MVT::i32 || VT == MVT::i64) &&
45685 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45686
45687 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45688 // of the condition can usually be folded into a compare predicate, but even
45689 // without that, the sequence should be cheaper than a CMOV alternative.
45690 if (TrueVal.slt(FalseVal)) {
45691 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45692 std::swap(TrueC, FalseC);
45693 }
45694
45695 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
45696 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45697
45698 // Multiply condition by the difference if non-one.
45699 if (!AbsDiff.isOne())
45700 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45701
45702 // Add the base if non-zero.
45703 if (!FalseC->isZero())
45704 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45705
45706 return R;
45707 }
45708
45709 return SDValue();
45710}
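// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): the
// arithmetic emitted by combineSelectOfTwoConstants,
//   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC,
// with the condition inverted first when TC < FC so the multiplier stays
// positive (a power of two, or 3/5/9 for LEA). The combine bails out on
// signed overflow of the difference; that check is omitted here.
#include <cstdint>
#include <utility>

static int64_t selectTwoConstants(bool Cond, int64_t TC, int64_t FC) {
  if (TC < FC) {              // make the difference positive
    Cond = !Cond;             // the 'not' usually folds into the compare
    std::swap(TC, FC);
  }
  int64_t Diff = TC - FC;     // shift/LEA-friendly multiplier
  return static_cast<int64_t>(Cond) * Diff + FC;
}
// For example, selectTwoConstants(true, 9, 1) == 9 and
// selectTwoConstants(false, 9, 1) == 1, matching the plain select.
// ---------------------------------------------------------------------------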
45711
45712/// If this is a *dynamic* select (non-constant condition) and we can match
45713/// this node with one of the variable blend instructions, restructure the
45714/// condition so that blends can use the high (sign) bit of each element.
45715/// This function will also call SimplifyDemandedBits on already created
45716/// BLENDV to perform additional simplifications.
45717static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45718 TargetLowering::DAGCombinerInfo &DCI,
45719 const X86Subtarget &Subtarget) {
45720 SDValue Cond = N->getOperand(0);
45721 if ((N->getOpcode() != ISD::VSELECT &&
45722 N->getOpcode() != X86ISD::BLENDV) ||
45723 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45724 return SDValue();
45725
45726 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45727 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45728 EVT VT = N->getValueType(0);
45729
45730 // We can only handle the cases where VSELECT is directly legal on the
45731 // subtarget. We custom lower VSELECT nodes with constant conditions and
45732 // this makes it hard to see whether a dynamic VSELECT will correctly
45733 // lower, so we both check the operation's status and explicitly handle the
45734 // cases where a *dynamic* blend will fail even though a constant-condition
45735 // blend could be custom lowered.
45736 // FIXME: We should find a better way to handle this class of problems.
45737 // Potentially, we should combine constant-condition vselect nodes
45738 // pre-legalization into shuffles and not mark as many types as custom
45739 // lowered.
45740 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45741 return SDValue();
45742 // FIXME: We don't support i16-element blends currently. We could and
45743 // should support them by making *all* the bits in the condition be set
45744 // rather than just the high bit and using an i8-element blend.
45745 if (VT.getVectorElementType() == MVT::i16)
45746 return SDValue();
45747 // Dynamic blending was only available from SSE4.1 onward.
45748 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45749 return SDValue();
45750  // Byte blends are only available in AVX2.
45751 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45752 return SDValue();
45753 // There are no 512-bit blend instructions that use sign bits.
45754 if (VT.is512BitVector())
45755 return SDValue();
45756
45757 // Don't optimize before the condition has been transformed to a legal type
45758 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45759 if (BitWidth < 8 || BitWidth > 64)
45760 return SDValue();
45761
45762 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45763 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45764 UI != UE; ++UI)
45765 if ((UI->getOpcode() != ISD::VSELECT &&
45766 UI->getOpcode() != X86ISD::BLENDV) ||
45767 UI.getOperandNo() != 0)
45768 return false;
45769
45770 return true;
45771 };
45772
45773 APInt DemandedBits(APInt::getSignMask(BitWidth));
45774
45775 if (OnlyUsedAsSelectCond(Cond)) {
45776 KnownBits Known;
45777 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45778 !DCI.isBeforeLegalizeOps());
45779 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45780 return SDValue();
45781
45782 // If we changed the computation somewhere in the DAG, this change will
45783 // affect all users of Cond. Update all the nodes so that we do not use
45784 // the generic VSELECT anymore. Otherwise, we may perform wrong
45785 // optimizations as we messed with the actual expectation for the vector
45786 // boolean values.
45787 for (SDNode *U : Cond->uses()) {
45788 if (U->getOpcode() == X86ISD::BLENDV)
45789 continue;
45790
45791 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45792 Cond, U->getOperand(1), U->getOperand(2));
45793 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45794 DCI.AddToWorklist(U);
45795 }
45796 DCI.CommitTargetLoweringOpt(TLO);
45797 return SDValue(N, 0);
45798 }
45799
45800 // Otherwise we can still at least try to simplify multiple use bits.
45801 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45802 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45803 N->getOperand(1), N->getOperand(2));
45804
45805 return SDValue();
45806}
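// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): the
// sign-bit-driven blend semantics combineVSelectToBLENDV relies on. Variable
// blends pick each element based solely on the sign (high) bit of the mask
// element, so only that bit of the condition has to be kept accurate.
#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t N>
static std::array<int8_t, N> blendvModel(const std::array<int8_t, N> &Mask,
                                         const std::array<int8_t, N> &TrueV,
                                         const std::array<int8_t, N> &FalseV) {
  std::array<int8_t, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = (Mask[i] < 0) ? TrueV[i] : FalseV[i]; // sign bit set -> true arm
  return R;
}
// ---------------------------------------------------------------------------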
45807
45808// Try to match:
45809// (or (and (M, (sub 0, X)), (pandn M, X)))
45810// which is a special case of:
45811// (select M, (sub 0, X), X)
45812// Per:
45813// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45814// We know that, if fNegate is 0 or 1:
45815// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45816//
45817// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45818// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45819// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45820// This lets us transform our vselect to:
45821// (add (xor X, M), (and M, 1))
45822// And further to:
45823// (sub (xor X, M), M)
45824static SDValue combineLogicBlendIntoConditionalNegate(
45825 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45826 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45827 EVT MaskVT = Mask.getValueType();
45828  assert(MaskVT.isInteger() &&
45829         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45830         "Mask must be zero/all-bits");
45831
45832 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45833 return SDValue();
45834 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
45835 return SDValue();
45836
45837 auto IsNegV = [](SDNode *N, SDValue V) {
45838 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45839 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45840 };
45841
45842 SDValue V;
45843 if (IsNegV(Y.getNode(), X))
45844 V = X;
45845 else if (IsNegV(X.getNode(), Y))
45846 V = Y;
45847 else
45848 return SDValue();
45849
45850 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45851 SDValue SubOp2 = Mask;
45852
45853 // If the negate was on the false side of the select, then
45854 // the operands of the SUB need to be swapped. PR 27251.
45855 // This is because the pattern being matched above is
45857  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
45858  // but if the pattern matched was
45859  // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
45859 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45860 // pattern also needs to be a negation of the replacement pattern above.
45861 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45862 // sub accomplishes the negation of the replacement pattern.
45863 if (V == Y)
45864 std::swap(SubOp1, SubOp2);
45865
45866 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45867 return DAG.getBitcast(VT, Res);
45868}
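// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM source): a check
// of the bit-hack identity used above. For a mask M that is 0 or all-ones,
// (M ? -X : X) == (X ^ M) - M, which is the "sub (xor X, M), M" form emitted.
#include <cassert>
#include <cstdint>

static int32_t conditionalNegate(int32_t X, int32_t M) {
  assert(M == 0 || M == -1);   // M must be zero or all-bits
  return (X ^ M) - M;          // sub (xor X, M), M
}

static void checkConditionalNegate() {
  const int32_t Xs[] = {0, 1, -7, 42, INT32_MIN + 1};
  const int32_t Ms[] = {0, -1};
  for (int32_t X : Xs)
    for (int32_t M : Ms)
      assert(conditionalNegate(X, M) == (M ? -X : X));
}
// ---------------------------------------------------------------------------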
45869
45870/// Do target-specific dag combines on SELECT and VSELECT nodes.
45871static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45872 TargetLowering::DAGCombinerInfo &DCI,
45873 const X86Subtarget &Subtarget) {
45874 SDLoc DL(N);
45875 SDValue Cond = N->getOperand(0);
45876 SDValue LHS = N->getOperand(1);
45877 SDValue RHS = N->getOperand(2);
45878
45879 // Try simplification again because we use this function to optimize
45880 // BLENDV nodes that are not handled by the generic combiner.
45881 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45882 return V;
45883
45884 EVT VT = LHS.getValueType();
45885 EVT CondVT = Cond.getValueType();
45886 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45887 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45888
45889 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45890 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45891 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45892 if (CondVT.isVector() && CondVT.isInteger() &&
45893 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45894 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45895 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45896 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45897 DL, DAG, Subtarget))
45898 return V;
45899
45900 // Convert vselects with constant condition into shuffles.
45901 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45902 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45903 SmallVector<int, 64> Mask;
45904 if (createShuffleMaskFromVSELECT(Mask, Cond,
45905 N->getOpcode() == X86ISD::BLENDV))
45906 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45907 }
45908
45909 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45910 // by forcing the unselected elements to zero.
45911 // TODO: Can we handle more shuffles with this?
45912 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45913 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45914 LHS.hasOneUse() && RHS.hasOneUse()) {
45915 MVT SimpleVT = VT.getSimpleVT();
45916 SmallVector<SDValue, 1> LHSOps, RHSOps;
45917 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45918 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45919 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
45920 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
45921 int NumElts = VT.getVectorNumElements();
45922 for (int i = 0; i != NumElts; ++i) {
45923 // getConstVector sets negative shuffle mask values as undef, so ensure
45924 // we hardcode SM_SentinelZero values to zero (0x80).
45925 if (CondMask[i] < NumElts) {
45926 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45927 RHSMask[i] = 0x80;
45928 } else {
45929 LHSMask[i] = 0x80;
45930 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45931 }
45932 }
45933 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45934 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45935 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45936 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45937 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45938 }
45939 }
45940
45941 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45942 // instructions match the semantics of the common C idiom x<y?x:y but not
45943 // x<=y?x:y, because of how they handle negative zero (which can be
45944 // ignored in unsafe-math mode).
45945 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
45946 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45947 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
45948 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45949 (Subtarget.hasSSE2() ||
45950 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45951 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45952
45953 unsigned Opcode = 0;
45954 // Check for x CC y ? x : y.
45955 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45956 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45957 switch (CC) {
45958 default: break;
45959 case ISD::SETULT:
45960 // Converting this to a min would handle NaNs incorrectly, and swapping
45961 // the operands would cause it to handle comparisons between positive
45962 // and negative zero incorrectly.
45963 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45964 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45965 !(DAG.isKnownNeverZeroFloat(LHS) ||
45966 DAG.isKnownNeverZeroFloat(RHS)))
45967 break;
45968 std::swap(LHS, RHS);
45969 }
45970 Opcode = X86ISD::FMIN;
45971 break;
45972 case ISD::SETOLE:
45973 // Converting this to a min would handle comparisons between positive
45974 // and negative zero incorrectly.
45975 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45976 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45977 break;
45978 Opcode = X86ISD::FMIN;
45979 break;
45980 case ISD::SETULE:
45981 // Converting this to a min would handle both negative zeros and NaNs
45982 // incorrectly, but we can swap the operands to fix both.
45983 std::swap(LHS, RHS);
45984 [[fallthrough]];
45985 case ISD::SETOLT:
45986 case ISD::SETLT:
45987 case ISD::SETLE:
45988 Opcode = X86ISD::FMIN;
45989 break;
45990
45991 case ISD::SETOGE:
45992 // Converting this to a max would handle comparisons between positive
45993 // and negative zero incorrectly.
45994 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45995 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45996 break;
45997 Opcode = X86ISD::FMAX;
45998 break;
45999 case ISD::SETUGT:
46000 // Converting this to a max would handle NaNs incorrectly, and swapping
46001 // the operands would cause it to handle comparisons between positive
46002 // and negative zero incorrectly.
46003 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46004 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46005 !(DAG.isKnownNeverZeroFloat(LHS) ||
46006 DAG.isKnownNeverZeroFloat(RHS)))
46007 break;
46008 std::swap(LHS, RHS);
46009 }
46010 Opcode = X86ISD::FMAX;
46011 break;
46012 case ISD::SETUGE:
46013 // Converting this to a max would handle both negative zeros and NaNs
46014 // incorrectly, but we can swap the operands to fix both.
46015 std::swap(LHS, RHS);
46016 [[fallthrough]];
46017 case ISD::SETOGT:
46018 case ISD::SETGT:
46019 case ISD::SETGE:
46020 Opcode = X86ISD::FMAX;
46021 break;
46022 }
46023 // Check for x CC y ? y : x -- a min/max with reversed arms.
46024 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46025 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46026 switch (CC) {
46027 default: break;
46028 case ISD::SETOGE:
46029 // Converting this to a min would handle comparisons between positive
46030 // and negative zero incorrectly, and swapping the operands would
46031 // cause it to handle NaNs incorrectly.
46032 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46033 !(DAG.isKnownNeverZeroFloat(LHS) ||
46034 DAG.isKnownNeverZeroFloat(RHS))) {
46035 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46036 break;
46037 std::swap(LHS, RHS);
46038 }
46039 Opcode = X86ISD::FMIN;
46040 break;
46041 case ISD::SETUGT:
46042 // Converting this to a min would handle NaNs incorrectly.
46043 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46044 break;
46045 Opcode = X86ISD::FMIN;
46046 break;
46047 case ISD::SETUGE:
46048 // Converting this to a min would handle both negative zeros and NaNs
46049 // incorrectly, but we can swap the operands to fix both.
46050 std::swap(LHS, RHS);
46051 [[fallthrough]];
46052 case ISD::SETOGT:
46053 case ISD::SETGT:
46054 case ISD::SETGE:
46055 Opcode = X86ISD::FMIN;
46056 break;
46057
46058 case ISD::SETULT:
46059 // Converting this to a max would handle NaNs incorrectly.
46060 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46061 break;
46062 Opcode = X86ISD::FMAX;
46063 break;
46064 case ISD::SETOLE:
46065 // Converting this to a max would handle comparisons between positive
46066 // and negative zero incorrectly, and swapping the operands would
46067 // cause it to handle NaNs incorrectly.
46068 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46069 !DAG.isKnownNeverZeroFloat(LHS) &&
46070 !DAG.isKnownNeverZeroFloat(RHS)) {
46071 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46072 break;
46073 std::swap(LHS, RHS);
46074 }
46075 Opcode = X86ISD::FMAX;
46076 break;
46077 case ISD::SETULE:
46078 // Converting this to a max would handle both negative zeros and NaNs
46079 // incorrectly, but we can swap the operands to fix both.
46080 std::swap(LHS, RHS);
46081 [[fallthrough]];
46082 case ISD::SETOLT:
46083 case ISD::SETLT:
46084 case ISD::SETLE:
46085 Opcode = X86ISD::FMAX;
46086 break;
46087 }
46088 }
46089
46090 if (Opcode)
46091 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46092 }
46093
46094 // Some mask scalar intrinsics rely on checking if only one bit is set
46095 // and implement it in C code like this:
46096 // A[0] = (U & 1) ? A[0] : W[0];
46097 // This creates some redundant instructions that break pattern matching.
46098 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
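      // For illustration only (a sketch; MaskMove, U, A and W are hypothetical
      // names), the surrounding C code typically looks like:
      //   void MaskMove(unsigned char U, float A[4], const float W[4]) {
      //     A[0] = (U & 1) ? A[0] : W[0];
      //   }
      // which reaches this combine as the (setcc (and U, 1), 0, seteq) select
      // described above.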
46099 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46100 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46101 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46102 SDValue AndNode = Cond.getOperand(0);
46103 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46104 isNullConstant(Cond.getOperand(1)) &&
46105 isOneConstant(AndNode.getOperand(1))) {
46106 // LHS and RHS swapped due to
46107 // setcc outputting 1 when AND resulted in 0 and vice versa.
46108 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46109 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46110 }
46111 }
46112
46113 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46114 // lowering on KNL. In this case we convert it to
46115 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
46116 // The same situation applies to all vectors of i8 and i16 without BWI.
46117 // Make sure we extend these even before type legalization gets a chance to
46118 // split wide vectors.
46119 // Since SKX these selects have a proper lowering.
46120 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46121 CondVT.getVectorElementType() == MVT::i1 &&
46122 (VT.getVectorElementType() == MVT::i8 ||
46123 VT.getVectorElementType() == MVT::i16)) {
46124 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46125 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46126 }
46127
46128 // AVX512 - Extend select with zero to merge with target shuffle.
46129 // select(mask, extract_subvector(shuffle(x)), zero) -->
46130 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46131 // TODO - support non target shuffles as well.
46132 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46133 CondVT.getVectorElementType() == MVT::i1) {
46134 auto SelectableOp = [&TLI](SDValue Op) {
46135 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46136 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46137 isNullConstant(Op.getOperand(1)) &&
46138 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46139 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46140 };
46141
46142 bool SelectableLHS = SelectableOp(LHS);
46143 bool SelectableRHS = SelectableOp(RHS);
46144 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46145 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46146
46147 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46148 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46149 : RHS.getOperand(0).getValueType();
46150 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46151 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46152 VT.getSizeInBits());
46153 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46154 VT.getSizeInBits());
46155 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46156 DAG.getUNDEF(SrcCondVT), Cond,
46157 DAG.getIntPtrConstant(0, DL));
46158 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46159 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46160 }
46161 }
46162
46163 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46164 return V;
46165
46166 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46167 Cond.hasOneUse()) {
46168 EVT CondVT = Cond.getValueType();
46169 SDValue Cond0 = Cond.getOperand(0);
46170 SDValue Cond1 = Cond.getOperand(1);
46171 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46172
46173 // Canonicalize min/max:
46174 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46175 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46176 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46177 // the need for an extra compare against zero. e.g.
46178 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46179 // subl %esi, %edi
46180 // testl %edi, %edi
46181 // movl $0, %eax
46182 // cmovgl %edi, %eax
46183 // =>
46184 // xorl %eax, %eax
46185 // subl %esi, %edi
46186 // cmovsl %eax, %edi
46187 //
46188 // We can also canonicalize
46189 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46190 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46191 // This allows the use of a test instruction for the compare.
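      // Quick check of the unsigned case (illustrative): for x == 0 both forms
      // give 1, for x == 1 both give 1, and for x > 1 both give x, so
      // (x u> 1) ? x : 1 and (x != 0) ? x : 1 agree everywhere while the
      // latter only needs a TEST.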
46192 if (LHS == Cond0 && RHS == Cond1) {
46193 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
46194 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
46195 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
46196 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46197 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46198 }
46199 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
46200 ISD::CondCode NewCC = ISD::SETUGE;
46201 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46202 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46203 }
46204 }
46205
46206 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
46207 // fold eq + gt/lt nested selects into ge/le selects
46208 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
46209 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46210 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
46211 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46212 // .. etc ..
46213 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
46214 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
46215 SDValue InnerSetCC = RHS.getOperand(0);
46216 ISD::CondCode InnerCC =
46217 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46218 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
46219 Cond0 == InnerSetCC.getOperand(0) &&
46220 Cond1 == InnerSetCC.getOperand(1)) {
46221 ISD::CondCode NewCC;
46222 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46223 case ISD::SETGT: NewCC = ISD::SETGE; break;
46224 case ISD::SETLT: NewCC = ISD::SETLE; break;
46225 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46226 case ISD::SETULT: NewCC = ISD::SETULE; break;
46227 default: NewCC = ISD::SETCC_INVALID; break;
46228 }
46229 if (NewCC != ISD::SETCC_INVALID) {
46230 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46231 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46232 }
46233 }
46234 }
46235 }
46236
46237 // Check if the first operand is all zeros and Cond type is vXi1.
46238 // If this is an avx512 target we can improve the use of zero masking by
46239 // swapping the operands and inverting the condition.
46240 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46241 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46242 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46243 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46244 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46245 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46246 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46247 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46248 }
46249
46250 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46251 // get split by legalization.
46252 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46253 CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
46254 TLI.isTypeLegal(VT.getScalarType())) {
46255 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46256 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46257 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46258 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46259 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46260 }
46261 }
46262
46263 // Early exit check
46264 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
46265 return SDValue();
46266
46267 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46268 return V;
46269
46270 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46271 return V;
46272
46273 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46274 return V;
46275
46276 // select(~Cond, X, Y) -> select(Cond, Y, X)
46277 if (CondVT.getScalarType() != MVT::i1) {
46278 if (SDValue CondNot = IsNOT(Cond, DAG))
46279 return DAG.getNode(N->getOpcode(), DL, VT,
46280 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46281
46282 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse()) {
46283 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46284 // signbit.
46285 if (ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
46286 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46287 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46288 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46289 }
46290
46291 // smin(LHS, RHS) : select(pcmpgt(RHS, LHS), LHS, RHS)
46292 // -> select(pcmpgt(LHS, RHS), RHS, LHS)
46293 // iff the commuted pcmpgt() already exists.
46294 // TODO: Could DAGCombiner::combine cse search for SETCC nodes, like it
46295 // does for commutative binops?
46296 if (Cond.getOperand(0) == RHS && Cond.getOperand(1) == LHS) {
46297 if (SDNode *FlipCond =
46298 DAG.getNodeIfExists(X86ISD::PCMPGT, DAG.getVTList(CondVT),
46299 {Cond.getOperand(1), Cond.getOperand(0)})) {
46300 return DAG.getNode(N->getOpcode(), DL, VT, SDValue(FlipCond, 0), RHS,
46301 LHS);
46302 }
46303 }
46304 }
46305 }
46306
46307 // Try to optimize vXi1 selects if both operands are either all constants or
46308 // bitcasts from scalar integer type. In that case we can convert the operands
46309 // to integer and use an integer select which will be converted to a CMOV.
46310 // We need to take a little bit of care to avoid creating an i64 type after
46311 // type legalization.
46312 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46313 VT.getVectorElementType() == MVT::i1 &&
46314 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46315 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46316 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46317 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46318
46319 if ((LHSIsConst ||
46320 (LHS.getOpcode() == ISD::BITCAST &&
46321 LHS.getOperand(0).getValueType() == IntVT)) &&
46322 (RHSIsConst ||
46323 (RHS.getOpcode() == ISD::BITCAST &&
46324 RHS.getOperand(0).getValueType() == IntVT))) {
46325 if (LHSIsConst)
46326 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46327 else
46328 LHS = LHS.getOperand(0);
46329
46330 if (RHSIsConst)
46331 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46332 else
46333 RHS = RHS.getOperand(0);
46334
46335 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46336 return DAG.getBitcast(VT, Select);
46337 }
46338 }
46339
46340 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46341 // single bits, then invert the predicate and swap the select operands.
46342 // This can lower using a vector shift bit-hack rather than mask and compare.
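  // The bit-hack, sketched for a single i32 lane: testing (x & (1 << b)) != 0
  // is equivalent to testing (x << (31 - b)) < 0 as a signed value, which
  // moves the selected bit into the sign bit that VSELECT/BLENDV consumes.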
46343 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46344 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46345 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46346 Cond.getOperand(0).getOpcode() == ISD::AND &&
46347 isNullOrNullSplat(Cond.getOperand(1)) &&
46348 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46349 Cond.getOperand(0).getValueType() == VT) {
46350 // The 'and' mask must be composed of power-of-2 constants.
46351 SDValue And = Cond.getOperand(0);
46352 auto *C = isConstOrConstSplat(And.getOperand(1));
46353 if (C && C->getAPIntValue().isPowerOf2()) {
46354 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46355 SDValue NotCond =
46356 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46357 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46358 }
46359
46360 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46361 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46362 // 16-bit lacks a proper blendv.
46363 unsigned EltBitWidth = VT.getScalarSizeInBits();
46364 bool CanShiftBlend =
46365 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46366 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46367 (Subtarget.hasXOP()));
46368 if (CanShiftBlend &&
46369 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46370 return C->getAPIntValue().isPowerOf2();
46371 })) {
46372 // Create a left-shift constant to get the mask bits over to the sign-bit.
46373 SDValue Mask = And.getOperand(1);
46374 SmallVector<int, 32> ShlVals;
46375 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46376 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46377 ShlVals.push_back(EltBitWidth - 1 -
46378 MaskVal->getAPIntValue().exactLogBase2());
46379 }
46380 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46381 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46382 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46383 SDValue NewCond =
46384 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46385 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46386 }
46387 }
46388
46389 return SDValue();
46390}
46391
46392/// Combine:
46393/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46394/// to:
46395/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46396/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46397/// Note that this is only legal for some op/cc combinations.
46398static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46399 SelectionDAG &DAG,
46400 const X86Subtarget &Subtarget) {
46401 // This combine only operates on CMP-like nodes.
46402 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46403 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46404 return SDValue();
46405
46406 // Can't replace the cmp if it has more uses than the one we're looking at.
46407 // FIXME: We would like to be able to handle this, but would need to make sure
46408 // all uses were updated.
46409 if (!Cmp.hasOneUse())
46410 return SDValue();
46411
46412 // This only applies to variations of the common case:
46413 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46414 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46415 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46416 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46417 // Using the proper condcodes (see below), overflow is checked for.
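  // For illustration only (a sketch; RefCount and destroy are hypothetical
  // names), a typical source-level pattern that benefits is
  //   if (__atomic_fetch_sub(&RefCount, 1, __ATOMIC_ACQ_REL) == 1)
  //     destroy();
  // where the comparison can reuse the EFLAGS of the LOCKed RMW instead of
  // testing the loaded value separately.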
46418
46419 // FIXME: We can generalize both constraints:
46420 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46421 // - LHS != 1
46422 // if the result is compared.
46423
46424 SDValue CmpLHS = Cmp.getOperand(0);
46425 SDValue CmpRHS = Cmp.getOperand(1);
46426 EVT CmpVT = CmpLHS.getValueType();
46427
46428 if (!CmpLHS.hasOneUse())
46429 return SDValue();
46430
46431 unsigned Opc = CmpLHS.getOpcode();
46432 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46433 return SDValue();
46434
46435 SDValue OpRHS = CmpLHS.getOperand(2);
46436 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46437 if (!OpRHSC)
46438 return SDValue();
46439
46440 APInt Addend = OpRHSC->getAPIntValue();
46441 if (Opc == ISD::ATOMIC_LOAD_SUB)
46442 Addend = -Addend;
46443
46444 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46445 if (!CmpRHSC)
46446 return SDValue();
46447
46448 APInt Comparison = CmpRHSC->getAPIntValue();
46449 APInt NegAddend = -Addend;
46450
46451 // See if we can adjust the CC to make the comparison match the negated
46452 // addend.
46453 if (Comparison != NegAddend) {
46454 APInt IncComparison = Comparison + 1;
46455 if (IncComparison == NegAddend) {
46456 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46457 Comparison = IncComparison;
46458 CC = X86::COND_AE;
46459 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46460 Comparison = IncComparison;
46461 CC = X86::COND_L;
46462 }
46463 }
46464 APInt DecComparison = Comparison - 1;
46465 if (DecComparison == NegAddend) {
46466 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46467 Comparison = DecComparison;
46468 CC = X86::COND_A;
46469 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46470 Comparison = DecComparison;
46471 CC = X86::COND_LE;
46472 }
46473 }
46474 }
46475
46476 // If the addend is the negation of the comparison value, then we can do
46477 // a full comparison by emitting the atomic arithmetic as a locked sub.
46478 if (Comparison == NegAddend) {
46479 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46480 // atomic sub.
46481 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46482 auto AtomicSub = DAG.getAtomic(
46483 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46484 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46485 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46486 AN->getMemOperand());
46487 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
46488 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46489 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46490 return LockOp;
46491 }
46492
46493 // We can handle comparisons with zero in a number of cases by manipulating
46494 // the CC used.
46495 if (!Comparison.isZero())
46496 return SDValue();
46497
46498 if (CC == X86::COND_S && Addend == 1)
46499 CC = X86::COND_LE;
46500 else if (CC == X86::COND_NS && Addend == 1)
46501 CC = X86::COND_G;
46502 else if (CC == X86::COND_G && Addend == -1)
46503 CC = X86::COND_GE;
46504 else if (CC == X86::COND_LE && Addend == -1)
46505 CC = X86::COND_L;
46506 else
46507 return SDValue();
46508
46509 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
46510 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46511 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46512 return LockOp;
46513}
46514
46515// Check whether a boolean test is testing a boolean value generated by
46516// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
46517// code.
46518//
46519// Simplify the following patterns:
46520// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
46521// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
46522// to (Op EFLAGS Cond)
46523//
46524// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
46525// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
46526// to (Op EFLAGS !Cond)
46527//
46528// where Op could be BRCOND or CMOV.
46529//
46530static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
46531 // This combine only operates on CMP-like nodes.
46532 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46533 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46534 return SDValue();
46535
46536 // Quit if not used as a boolean value.
46537 if (CC != X86::COND_E && CC != X86::COND_NE)
46538 return SDValue();
46539
46540 // Check CMP operands. One of them should be 0 or 1 and the other should be
46541 // a SetCC or extended from it.
46542 SDValue Op1 = Cmp.getOperand(0);
46543 SDValue Op2 = Cmp.getOperand(1);
46544
46545 SDValue SetCC;
46546 const ConstantSDNode* C = nullptr;
46547 bool needOppositeCond = (CC == X86::COND_E);
46548 bool checkAgainstTrue = false; // Is it a comparison against 1?
46549
46550 if ((C = dyn_cast<ConstantSDNode>(Op1)))
46551 SetCC = Op2;
46552 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
46553 SetCC = Op1;
46554 else // Quit if neither operand is a constant.
46555 return SDValue();
46556
46557 if (C->getZExtValue() == 1) {
46558 needOppositeCond = !needOppositeCond;
46559 checkAgainstTrue = true;
46560 } else if (C->getZExtValue() != 0)
46561 // Quit if the constant is neither 0 nor 1.
46562 return SDValue();
46563
46564 bool truncatedToBoolWithAnd = false;
46565 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
46566 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
46567 SetCC.getOpcode() == ISD::TRUNCATE ||
46568 SetCC.getOpcode() == ISD::AND) {
46569 if (SetCC.getOpcode() == ISD::AND) {
46570 int OpIdx = -1;
46571 if (isOneConstant(SetCC.getOperand(0)))
46572 OpIdx = 1;
46573 if (isOneConstant(SetCC.getOperand(1)))
46574 OpIdx = 0;
46575 if (OpIdx < 0)
46576 break;
46577 SetCC = SetCC.getOperand(OpIdx);
46578 truncatedToBoolWithAnd = true;
46579 } else
46580 SetCC = SetCC.getOperand(0);
46581 }
46582
46583 switch (SetCC.getOpcode()) {
46584 case X86ISD::SETCC_CARRY:
46585 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46586 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46587 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46588 // truncated to i1 using 'and'.
46589 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46590 break;
46591 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
46592        "Invalid use of SETCC_CARRY!");
46593 [[fallthrough]];
46594 case X86ISD::SETCC:
46595 // Set the condition code or opposite one if necessary.
46596 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46597 if (needOppositeCond)
46598 CC = X86::GetOppositeBranchCondition(CC);
46599 return SetCC.getOperand(1);
46600 case X86ISD::CMOV: {
46601 // Check whether false/true value has canonical one, i.e. 0 or 1.
46602 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46603 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46604 // Quit if true value is not a constant.
46605 if (!TVal)
46606 return SDValue();
46607 // Quit if false value is not a constant.
46608 if (!FVal) {
46609 SDValue Op = SetCC.getOperand(0);
46610 // Skip 'zext' or 'trunc' node.
46611 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46612 Op.getOpcode() == ISD::TRUNCATE)
46613 Op = Op.getOperand(0);
46614 // A special case for rdrand/rdseed, where 0 is set if false cond is
46615 // found.
46616 if ((Op.getOpcode() != X86ISD::RDRAND &&
46617 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46618 return SDValue();
46619 }
46620 // Quit if false value is not the constant 0 or 1.
46621 bool FValIsFalse = true;
46622 if (FVal && FVal->getZExtValue() != 0) {
46623 if (FVal->getZExtValue() != 1)
46624 return SDValue();
46625 // If FVal is 1, opposite cond is needed.
46626 needOppositeCond = !needOppositeCond;
46627 FValIsFalse = false;
46628 }
46629 // Quit if TVal is not the constant opposite of FVal.
46630 if (FValIsFalse && TVal->getZExtValue() != 1)
46631 return SDValue();
46632 if (!FValIsFalse && TVal->getZExtValue() != 0)
46633 return SDValue();
46634 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46635 if (needOppositeCond)
46636 CC = X86::GetOppositeBranchCondition(CC);
46637 return SetCC.getOperand(3);
46638 }
46639 }
46640
46641 return SDValue();
46642}
46643
46644/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46645/// Match:
46646/// (X86or (X86setcc) (X86setcc))
46647/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46648static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46649 X86::CondCode &CC1, SDValue &Flags,
46650 bool &isAnd) {
46651 if (Cond->getOpcode() == X86ISD::CMP) {
46652 if (!isNullConstant(Cond->getOperand(1)))
46653 return false;
46654
46655 Cond = Cond->getOperand(0);
46656 }
46657
46658 isAnd = false;
46659
46660 SDValue SetCC0, SetCC1;
46661 switch (Cond->getOpcode()) {
46662 default: return false;
46663 case ISD::AND:
46664 case X86ISD::AND:
46665 isAnd = true;
46666 [[fallthrough]];
46667 case ISD::OR:
46668 case X86ISD::OR:
46669 SetCC0 = Cond->getOperand(0);
46670 SetCC1 = Cond->getOperand(1);
46671 break;
46672 };
46673
46674 // Make sure we have SETCC nodes, using the same flags value.
46675 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46676 SetCC1.getOpcode() != X86ISD::SETCC ||
46677 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46678 return false;
46679
46680 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46681 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46682 Flags = SetCC0->getOperand(1);
46683 return true;
46684}
46685
46686// When legalizing carry, we create carries via add X, -1
46687// If that comes from an actual carry, via setcc, we use the
46688// carry directly.
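// A brief illustration (sketch): if X is a 0/1 value produced by a setcc,
// then ADD X, -1 produces a carry-out exactly when X != 0, i.e. CF == X, so
// the original carry flag can feed the consumer directly instead of being
// rematerialized.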
46689static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46690 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46691 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46692 bool FoundAndLSB = false;
46693 SDValue Carry = EFLAGS.getOperand(0);
46694 while (Carry.getOpcode() == ISD::TRUNCATE ||
46695 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46696 (Carry.getOpcode() == ISD::AND &&
46697 isOneConstant(Carry.getOperand(1)))) {
46698 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46699 Carry = Carry.getOperand(0);
46700 }
46701 if (Carry.getOpcode() == X86ISD::SETCC ||
46702 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46703 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46704 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46705 SDValue CarryOp1 = Carry.getOperand(1);
46706 if (CarryCC == X86::COND_B)
46707 return CarryOp1;
46708 if (CarryCC == X86::COND_A) {
46709 // Try to convert COND_A into COND_B in an attempt to facilitate
46710 // materializing "setb reg".
46711 //
46712 // Do not flip "e > c", where "c" is a constant, because Cmp
46713 // instruction cannot take an immediate as its first operand.
46714 //
46715 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46716 CarryOp1.getNode()->hasOneUse() &&
46717 CarryOp1.getValueType().isInteger() &&
46718 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46719 SDValue SubCommute =
46720 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46721 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46722 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46723 }
46724 }
46725 // If this is a check of the z flag of an add with 1, switch to the
46726 // C flag.
46727 if (CarryCC == X86::COND_E &&
46728 CarryOp1.getOpcode() == X86ISD::ADD &&
46729 isOneConstant(CarryOp1.getOperand(1)))
46730 return CarryOp1;
46731 } else if (FoundAndLSB) {
46732 SDLoc DL(Carry);
46733 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46734 if (Carry.getOpcode() == ISD::SRL) {
46735 BitNo = Carry.getOperand(1);
46736 Carry = Carry.getOperand(0);
46737 }
46738 return getBT(Carry, BitNo, DL, DAG);
46739 }
46740 }
46741 }
46742
46743 return SDValue();
46744}
46745
46746/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
46747/// to avoid the inversion.
46748static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46749 SelectionDAG &DAG,
46750 const X86Subtarget &Subtarget) {
46751 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46752 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46753 EFLAGS.getOpcode() != X86ISD::TESTP)
46754 return SDValue();
46755
46756 // PTEST/TESTP sets EFLAGS as:
46757 // TESTZ: ZF = (Op0 & Op1) == 0
46758 // TESTC: CF = (~Op0 & Op1) == 0
46759 // TESTNZC: ZF == 0 && CF == 0
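  // Scalar model for intuition (illustrative only):
  //   ZF(Op0, Op1) = ((Op0 & Op1) == 0);  CF(Op0, Op1) = ((~Op0 & Op1) == 0)
  // so CF(~X, Y) reduces to ((X & Y) == 0) == ZF(X, Y), which is why a NOT on
  // Op0 below can be absorbed by trading testc <-> testz condition codes.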
46760 EVT VT = EFLAGS.getValueType();
46761 SDValue Op0 = EFLAGS.getOperand(0);
46762 SDValue Op1 = EFLAGS.getOperand(1);
46763 EVT OpVT = Op0.getValueType();
46764
46765 // TEST*(~X,Y) == TEST*(X,Y)
46766 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46767 X86::CondCode InvCC;
46768 switch (CC) {
46769 case X86::COND_B:
46770 // testc -> testz.
46771 InvCC = X86::COND_E;
46772 break;
46773 case X86::COND_AE:
46774 // !testc -> !testz.
46775 InvCC = X86::COND_NE;
46776 break;
46777 case X86::COND_E:
46778 // testz -> testc.
46779 InvCC = X86::COND_B;
46780 break;
46781 case X86::COND_NE:
46782 // !testz -> !testc.
46783 InvCC = X86::COND_AE;
46784 break;
46785 case X86::COND_A:
46786 case X86::COND_BE:
46787 // testnzc -> testnzc (no change).
46788 InvCC = CC;
46789 break;
46790 default:
46791 InvCC = X86::COND_INVALID;
46792 break;
46793 }
46794
46795 if (InvCC != X86::COND_INVALID) {
46796 CC = InvCC;
46797 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46798 DAG.getBitcast(OpVT, NotOp0), Op1);
46799 }
46800 }
46801
46802 if (CC == X86::COND_E || CC == X86::COND_NE) {
46803 // TESTZ(X,~Y) == TESTC(Y,X)
46804 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46805 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46806 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46807 DAG.getBitcast(OpVT, NotOp1), Op0);
46808 }
46809
46810 if (Op0 == Op1) {
46811 SDValue BC = peekThroughBitcasts(Op0);
46812 EVT BCVT = BC.getValueType();
46813 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
46814        "Unexpected vector type");
46816 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46817 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46818 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46819 DAG.getBitcast(OpVT, BC.getOperand(0)),
46820 DAG.getBitcast(OpVT, BC.getOperand(1)));
46821 }
46822
46823 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46824 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46825 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46826 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46827 DAG.getBitcast(OpVT, BC.getOperand(0)),
46828 DAG.getBitcast(OpVT, BC.getOperand(1)));
46829 }
46830
46831 // If every element is an all-sign value, see if we can use MOVMSK to
46832 // more efficiently extract the sign bits and compare that.
46833 // TODO: Handle TESTC with comparison inversion.
46834 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46835 // MOVMSK combines to make sure it's never worse than PTEST?
46836 unsigned EltBits = BCVT.getScalarSizeInBits();
46837 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46838 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46839 APInt SignMask = APInt::getSignMask(EltBits);
46840 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46841 if (SDValue Res =
46842 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46843 // For vXi16 cases we need to use pmovmskb and extract every other
46844 // sign bit.
46845 SDLoc DL(EFLAGS);
46846 if (EltBits == 16) {
46847 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46848 Res = DAG.getBitcast(MovmskVT, Res);
46849 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46850 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46851 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46852 } else {
46853 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46854 }
46855 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46856 DAG.getConstant(0, DL, MVT::i32));
46857 }
46858 }
46859 }
46860
46861 // TESTZ(-1,X) == TESTZ(X,X)
46862 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46863 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46864
46865 // TESTZ(X,-1) == TESTZ(X,X)
46866 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46867 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46868
46869 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46870 // TODO: Add COND_NE handling?
46871 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46872 SDValue Src0 = peekThroughBitcasts(Op0);
46873 SDValue Src1 = peekThroughBitcasts(Op1);
46874 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46875 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46876 peekThroughBitcasts(Src0.getOperand(1)), true);
46877 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46878 peekThroughBitcasts(Src1.getOperand(1)), true);
46879 if (Src0 && Src1)
46880 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46881 DAG.getBitcast(MVT::v4i64, Src0),
46882 DAG.getBitcast(MVT::v4i64, Src1));
46883 }
46884 }
46885 }
46886
46887 return SDValue();
46888}
46889
46890// Attempt to simplify the MOVMSK input based on the comparison type.
46891static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46892 SelectionDAG &DAG,
46893 const X86Subtarget &Subtarget) {
46894 // Handle eq/ne against zero (any_of).
46895 // Handle eq/ne against -1 (all_of).
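  // Source-level shape for reference (illustrative, SSE intrinsics):
  //   bool any = _mm_movemask_ps(_mm_cmplt_ps(a, b)) != 0;    // any_of
  //   bool all = _mm_movemask_ps(_mm_cmplt_ps(a, b)) == 0xf;  // all_of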
46896 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46897 return SDValue();
46898 if (EFLAGS.getValueType() != MVT::i32)
46899 return SDValue();
46900 unsigned CmpOpcode = EFLAGS.getOpcode();
46901 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46902 return SDValue();
46903 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46904 if (!CmpConstant)
46905 return SDValue();
46906 const APInt &CmpVal = CmpConstant->getAPIntValue();
46907
46908 SDValue CmpOp = EFLAGS.getOperand(0);
46909 unsigned CmpBits = CmpOp.getValueSizeInBits();
46910 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46911
46912 // Peek through any truncate.
46913 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46914 CmpOp = CmpOp.getOperand(0);
46915
46916 // Bail if we don't find a MOVMSK.
46917 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46918 return SDValue();
46919
46920 SDValue Vec = CmpOp.getOperand(0);
46921 MVT VecVT = Vec.getSimpleValueType();
46922 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46923        "Unexpected MOVMSK operand");
46924 unsigned NumElts = VecVT.getVectorNumElements();
46925 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46926
46927 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46928 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46929 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46930 if (!IsAnyOf && !IsAllOf)
46931 return SDValue();
46932
46933 // TODO: Check more combining cases.
46934 // Here we check the number of uses of the cmp to decide whether to combine.
46935 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
46936 // below are known to benefit from this one-use constraint.
46937 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46938
46939 // See if we can peek through to a vector with a wider element type, if the
46940 // signbits extend down to all the sub-elements as well.
46941 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46942 // potential SimplifyDemandedBits/Elts cases.
46943 // If we looked through a truncate that discards bits, we can't do this
46944 // transform.
46945 // FIXME: We could do this transform for truncates that discarded bits by
46946 // inserting an AND mask between the new MOVMSK and the CMP.
46947 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46948 SDValue BC = peekThroughBitcasts(Vec);
46949 MVT BCVT = BC.getSimpleValueType();
46950 unsigned BCNumElts = BCVT.getVectorNumElements();
46951 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46952 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46953 BCNumEltBits > NumEltBits &&
46954 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46955 SDLoc DL(EFLAGS);
46956 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46957 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46958 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46959 DAG.getConstant(CmpMask, DL, MVT::i32));
46960 }
46961 }
46962
46963 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46964 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46965 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46966 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
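  // Intuition (sketch): per element, the sign bit of (X | Y) is the OR of the
  // two sign bits and the sign bit of (X & Y) is the AND, so an any_of test on
  // the 256-bit concatenation can be done on a half-width OR, and an all_of
  // test on a half-width AND, before the MOVMSK.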
46967 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46968 SmallVector<SDValue> Ops;
46969 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46970 Ops.size() == 2) {
46971 SDLoc DL(EFLAGS);
46972 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46973 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46974 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46975 DAG.getBitcast(SubVT, Ops[0]),
46976 DAG.getBitcast(SubVT, Ops[1]));
46977 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46978 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46979 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46980 DAG.getConstant(CmpMask, DL, MVT::i32));
46981 }
46982 }
46983
46984 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46985 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46986 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
46987 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
46988 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46989 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46990 SDValue BC = peekThroughBitcasts(Vec);
46991 // Ensure MOVMSK was testing every signbit of BC.
46992 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46993 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46994 SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
46995 BC.getOperand(0), BC.getOperand(1));
46996 V = DAG.getBitcast(TestVT, V);
46997 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46998 }
46999 // Check for 256-bit split vector cases.
47000 if (BC.getOpcode() == ISD::AND &&
47001 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47002 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47003 SDValue LHS = BC.getOperand(0);
47004 SDValue RHS = BC.getOperand(1);
47005 LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
47006 LHS.getOperand(0), LHS.getOperand(1));
47007 RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
47008 RHS.getOperand(0), RHS.getOperand(1));
47009 LHS = DAG.getBitcast(TestVT, LHS);
47010 RHS = DAG.getBitcast(TestVT, RHS);
47011 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47012 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47013 }
47014 }
47015 }
47016
47017 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47018 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47019 // sign bits prior to the comparison with zero unless we know that
47020 // the vXi16 splats the sign bit down to the lower i8 half.
47021 // TODO: Handle all_of patterns.
47022 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47023 SDValue VecOp0 = Vec.getOperand(0);
47024 SDValue VecOp1 = Vec.getOperand(1);
47025 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47026 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47027 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47028 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47029 SDLoc DL(EFLAGS);
47030 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47031 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47032 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47033 if (!SignExt0) {
47034 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47035 DAG.getConstant(0xAAAA, DL, MVT::i16));
47036 }
47037 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47038 DAG.getConstant(0, DL, MVT::i16));
47039 }
47040 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47041 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47042 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47043 (IsAnyOf || (SignExt0 && SignExt1))) {
47044 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47045 SDLoc DL(EFLAGS);
47046 SDValue Result = peekThroughBitcasts(Src);
47047 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47048 Result.getValueType().getVectorNumElements() <= NumElts) {
47049 SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
47050 Result.getOperand(0), Result.getOperand(1));
47051 V = DAG.getBitcast(MVT::v4i64, V);
47052 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47053 }
47054 Result = DAG.getBitcast(MVT::v32i8, Result);
47055 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47056 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47057 if (!SignExt0 || !SignExt1) {
47058 assert(IsAnyOf &&
47059        "Only perform v16i16 signmasks for any_of patterns");
47060 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47061 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47062 }
47063 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47064 DAG.getConstant(CmpMask, DL, MVT::i32));
47065 }
47066 }
47067 }
47068
47069 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47070 SmallVector<int, 32> ShuffleMask;
47071 SmallVector<SDValue, 2> ShuffleInputs;
47072 if (NumElts <= CmpBits &&
47073 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47074 ShuffleMask, DAG) &&
47075 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47076 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47077 unsigned NumShuffleElts = ShuffleMask.size();
47078 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47079 for (int M : ShuffleMask) {
47080 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47081 DemandedElts.setBit(M);
47082 }
47083 if (DemandedElts.isAllOnes()) {
47084 SDLoc DL(EFLAGS);
47085 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47086 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47087 Result =
47088 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47089 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47090 EFLAGS.getOperand(1));
47091 }
47092 }
47093
47094 return SDValue();
47095}
47096
47097/// Optimize an EFLAGS definition used according to the condition code \p CC
47098/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47099/// uses of chain values.
47100static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47101 SelectionDAG &DAG,
47102 const X86Subtarget &Subtarget) {
47103 if (CC == X86::COND_B)
47104 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47105 return Flags;
47106
47107 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47108 return R;
47109
47110 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47111 return R;
47112
47113 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47114 return R;
47115
47116 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47117}
47118
47119/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47120static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47121 TargetLowering::DAGCombinerInfo &DCI,
47122 const X86Subtarget &Subtarget) {
47123 SDLoc DL(N);
47124
47125 SDValue FalseOp = N->getOperand(0);
47126 SDValue TrueOp = N->getOperand(1);
47127 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47128 SDValue Cond = N->getOperand(3);
47129
47130 // cmov X, X, ?, ? --> X
47131 if (TrueOp == FalseOp)
47132 return TrueOp;
47133
47134 // Try to simplify the EFLAGS and condition code operands.
47135 // We can't always do this as FCMOV only supports a subset of X86 cond.
47136 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47137 if (!(FalseOp.getValueType() == MVT::f80 ||
47138 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47139 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47140 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47141 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47142 Flags};
47143 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47144 }
47145 }
47146
47147 // If this is a select between two integer constants, try to do some
47148 // optimizations. Note that the operands are ordered the opposite of SELECT
47149 // operands.
47150 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47151 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47152 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47153 // larger than FalseC (the false value).
47154 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47155 CC = X86::GetOppositeBranchCondition(CC);
47156 std::swap(TrueC, FalseC);
47157 std::swap(TrueOp, FalseOp);
47158 }
47159
47160 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47161 // This is efficient for any integer data type (including i8/i16) and
47162 // shift amount.
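      // e.g. (illustrative) cond ? 8 : 0 becomes zext(setcc(cond)) << 3, since
      // the zero-extended setcc result is 0 or 1 and shifting it left by
      // log2(8) yields exactly 0 or 8.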
47163 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47164 Cond = getSETCC(CC, Cond, DL, DAG);
47165
47166 // Zero extend the condition if needed.
47167 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47168
47169 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47170 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
47171 DAG.getConstant(ShAmt, DL, MVT::i8));
47172 return Cond;
47173 }
47174
47175 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
47176 // for any integer data type, including i8/i16.
47177 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47178 Cond = getSETCC(CC, Cond, DL, DAG);
47179
47180 // Zero extend the condition if needed.
47181 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
47182 FalseC->getValueType(0), Cond);
47183 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47184 SDValue(FalseC, 0));
47185 return Cond;
47186 }
47187
47188 // Optimize cases that will turn into an LEA instruction. This requires
47189 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
47190 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47191 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47192 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47193        "Implicit constant truncation");
47194
47195 bool isFastMultiplier = false;
47196 if (Diff.ult(10)) {
47197 switch (Diff.getZExtValue()) {
47198 default: break;
47199 case 1: // result = add base, cond
47200 case 2: // result = lea base( , cond*2)
47201 case 3: // result = lea base(cond, cond*2)
47202 case 4: // result = lea base( , cond*4)
47203 case 5: // result = lea base(cond, cond*4)
47204 case 8: // result = lea base( , cond*8)
47205 case 9: // result = lea base(cond, cond*8)
47206 isFastMultiplier = true;
47207 break;
47208 }
47209 }
47210
47211 if (isFastMultiplier) {
47212 Cond = getSETCC(CC, Cond, DL, DAG);
47213 // Zero extend the condition if needed.
47214 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47215 Cond);
47216 // Scale the condition by the difference.
47217 if (Diff != 1)
47218 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
47219 DAG.getConstant(Diff, DL, Cond.getValueType()));
47220
47221 // Add the base if non-zero.
47222 if (FalseC->getAPIntValue() != 0)
47223 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47224 SDValue(FalseC, 0));
47225 return Cond;
47226 }
47227 }
47228 }
47229 }
47230
47231 // Handle these cases:
47232 // (select (x != c), e, c) -> (select (x != c), e, x),
47233 // (select (x == c), c, e) -> (select (x == c), x, e)
47234 // where the c is an integer constant, and the "select" is the combination
47235 // of CMOV and CMP.
47236 //
47237 // The rationale for this change is that the conditional-move from a constant
47238 // needs two instructions, however, conditional-move from a register needs
47239 // only one instruction.
47240 //
47241 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47242 // some instruction-combining opportunities. This opt needs to be
47243 // postponed as late as possible.
47244 //
47245 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47246 // the DCI.xxxx conditions are provided to postpone the optimization as
47247 // late as possible.
47248
47249 ConstantSDNode *CmpAgainst = nullptr;
47250 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47251 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47252 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47253
47254 if (CC == X86::COND_NE &&
47255 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47256 CC = X86::GetOppositeBranchCondition(CC);
47257 std::swap(TrueOp, FalseOp);
47258 }
47259
47260 if (CC == X86::COND_E &&
47261 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47262 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47263 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47264 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47265 }
47266 }
47267 }
47268
47269 // Fold and/or of setcc's to double CMOV:
47270 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47271 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47272 //
47273 // This combine lets us generate:
47274 // cmovcc1 (jcc1 if we don't have CMOV)
47275 // cmovcc2 (same)
47276 // instead of:
47277 // setcc1
47278 // setcc2
47279 // and/or
47280 // cmovne (jne if we don't have CMOV)
47281 // When we can't use the CMOV instruction, it might increase branch
47282 // mispredicts.
47283 // When we can use CMOV, or when there is no mispredict, this improves
47284 // throughput and reduces register pressure.
47285 //
47286 if (CC == X86::COND_NE) {
47287 SDValue Flags;
47288 X86::CondCode CC0, CC1;
47289 bool isAndSetCC;
47290 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47291 if (isAndSetCC) {
47292 std::swap(FalseOp, TrueOp);
47293 CC0 = X86::GetOppositeBranchCondition(CC0);
47294 CC1 = X86::GetOppositeBranchCondition(CC1);
47295 }
47296
47297 SDValue LOps[] = {FalseOp, TrueOp,
47298 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47299 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47300 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47301 Flags};
47302 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47303 return CMOV;
47304 }
47305 }
47306
47307 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47308 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47309 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47310 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47311 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47312 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47313 SDValue Add = TrueOp;
47314 SDValue Const = FalseOp;
47315 // Canonicalize the condition code for easier matching and output.
47316 if (CC == X86::COND_E)
47317 std::swap(Add, Const);
47318
47319 // We might have replaced the constant in the cmov with the LHS of the
47320 // compare. If so change it to the RHS of the compare.
47321 if (Const == Cond.getOperand(0))
47322 Const = Cond.getOperand(1);
47323
47324 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47325 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47326 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47327 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47328 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47329 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47330 EVT VT = N->getValueType(0);
47331 // This should constant fold.
47332 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47333 SDValue CMov =
47334 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47335 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47336 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47337 }
47338 }
47339
47340 return SDValue();
47341}
47342
47343/// Different mul shrinking modes.
47344enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47345
47346static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47347 EVT VT = N->getOperand(0).getValueType();
47348 if (VT.getScalarSizeInBits() != 32)
47349 return false;
47350
47351 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47352 unsigned SignBits[2] = {1, 1};
47353 bool IsPositive[2] = {false, false};
47354 for (unsigned i = 0; i < 2; i++) {
47355 SDValue Opd = N->getOperand(i);
47356
47357 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47358 IsPositive[i] = DAG.SignBitIsZero(Opd);
47359 }
47360
47361 bool AllPositive = IsPositive[0] && IsPositive[1];
47362 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47363 // When ranges are from -128 ~ 127, use MULS8 mode.
47364 if (MinSignBits >= 25)
47365 Mode = ShrinkMode::MULS8;
47366 // When ranges are from 0 ~ 255, use MULU8 mode.
47367 else if (AllPositive && MinSignBits >= 24)
47368 Mode = ShrinkMode::MULU8;
47369 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47370 else if (MinSignBits >= 17)
47371 Mode = ShrinkMode::MULS16;
47372 // When ranges are from 0 ~ 65535, use MULU16 mode.
47373 else if (AllPositive && MinSignBits >= 16)
47374 Mode = ShrinkMode::MULU16;
47375 else
47376 return false;
47377 return true;
47378}
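// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): why the thresholds in
// canReduceVMulWidth are 25/24/17/16. For a 32-bit element, having N known
// sign bits means the value fits in 32 - N + 1 signed bits, so N >= 25 gives
// an 8-bit signed range and N >= 17 a 16-bit signed range; with the sign bit
// known zero, N >= 24 / N >= 16 give the 8-/16-bit unsigned ranges. The
// helper below is only a scalar stand-in for SelectionDAG::ComputeNumSignBits.
#include <cassert>
#include <cstdint>

static unsigned numSignBits(int32_t V) {
  uint32_t U = (uint32_t)V, Sign = U >> 31;
  unsigned N = 1; // The sign bit itself always counts.
  while (N < 32 && ((U >> (31 - N)) & 1) == Sign)
    ++N;
  return N;
}

int main() {
  assert(numSignBits(127) == 25 && numSignBits(-128) == 25);     // MULS8
  assert(numSignBits(255) == 24);                                // MULU8
  assert(numSignBits(32767) == 17 && numSignBits(-32768) == 17); // MULS16
  assert(numSignBits(65535) == 16);                              // MULU16
}
// --------------------------------------------------------------------------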
47379
47380/// When the operands of vector mul are extended from smaller size values,
47381/// like i8 and i16, the type of the mul may be shrunk to generate more
47382/// efficient code. Two typical patterns are handled:
47383/// Pattern1:
47384/// %2 = sext/zext <N x i8> %1 to <N x i32>
47385/// %4 = sext/zext <N x i8> %3 to <N x i32>
47386/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47387/// %5 = mul <N x i32> %2, %4
47388///
47389/// Pattern2:
47390/// %2 = zext/sext <N x i16> %1 to <N x i32>
47391/// %4 = zext/sext <N x i16> %3 to <N x i32>
47392/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47393/// %5 = mul <N x i32> %2, %4
47394///
47395/// There are four mul shrinking modes:
47396/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47397/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47398/// generate pmullw+sext32 for it (MULS8 mode).
47399/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47400/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47401/// generate pmullw+zext32 for it (MULU8 mode).
47402/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47403/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47404/// generate pmullw+pmulhw for it (MULS16 mode).
47405/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47406/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47407/// generate pmullw+pmulhuw for it (MULU16 mode).
47408static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
47409 const X86Subtarget &Subtarget) {
47410 // Check for legality
47411 // pmullw/pmulhw are not supported by SSE.
47412 if (!Subtarget.hasSSE2())
47413 return SDValue();
47414
47415 // Check for profitability
47416 // pmulld is supported since SSE41. It is better to use pmulld
47417 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47418 // the expansion.
47419 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47420 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47421 return SDValue();
47422
47423 ShrinkMode Mode;
47424 if (!canReduceVMulWidth(N, DAG, Mode))
47425 return SDValue();
47426
47427 SDLoc DL(N);
47428 SDValue N0 = N->getOperand(0);
47429 SDValue N1 = N->getOperand(1);
47430 EVT VT = N->getOperand(0).getValueType();
47431 unsigned NumElts = VT.getVectorNumElements();
47432 if ((NumElts % 2) != 0)
47433 return SDValue();
47434
47435 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47436
47437 // Shrink the operands of mul.
47438 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47439 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47440
47441 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47442 // lower part is needed.
47443 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
47444 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
47445 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
47446 : ISD::SIGN_EXTEND,
47447 DL, VT, MulLo);
47448
47449 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
47450 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
47451 // the higher part is also needed.
47452 SDValue MulHi =
47453 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
47454 ReducedVT, NewN0, NewN1);
47455
47456 // Repack the lower part and higher part result of mul into a wider
47457 // result.
47458 // Generate shuffle functioning as punpcklwd.
47459 SmallVector<int, 16> ShuffleMask(NumElts);
47460 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47461 ShuffleMask[2 * i] = i;
47462 ShuffleMask[2 * i + 1] = i + NumElts;
47463 }
47464 SDValue ResLo =
47465 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47466 ResLo = DAG.getBitcast(ResVT, ResLo);
47467 // Generate shuffle functioning as punpckhwd.
47468 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47469 ShuffleMask[2 * i] = i + NumElts / 2;
47470 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
47471 }
47472 SDValue ResHi =
47473 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47474 ResHi = DAG.getBitcast(ResVT, ResHi);
47475 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
47476}
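// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): the two shuffle masks
// built in reduceVMULWidth interleave MulLo and MulHi exactly like
// punpcklwd/punpckhwd, so each i32 result lane is (lo16, hi16) of one product.
// For NumElts == 8 the masks are {0,8,1,9,2,10,3,11} and {4,12,5,13,6,14,7,15}.
#include <cassert>
#include <vector>

int main() {
  const unsigned NumElts = 8; // e.g. a v8i32 multiply shrunk to v8i16
  std::vector<int> Lo(NumElts), Hi(NumElts);
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    Lo[2 * i] = i;                       // low-half lanes of MulLo
    Lo[2 * i + 1] = i + NumElts;         // low-half lanes of MulHi
    Hi[2 * i] = i + NumElts / 2;         // high-half lanes of MulLo
    Hi[2 * i + 1] = i + NumElts * 3 / 2; // high-half lanes of MulHi
  }
  assert((Lo == std::vector<int>{0, 8, 1, 9, 2, 10, 3, 11}));
  assert((Hi == std::vector<int>{4, 12, 5, 13, 6, 14, 7, 15}));
}
// --------------------------------------------------------------------------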
47477
47478static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
47479 EVT VT, const SDLoc &DL) {
47480
47481 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
47482 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47483 DAG.getConstant(Mult, DL, VT));
47484 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
47485 DAG.getConstant(Shift, DL, MVT::i8));
47486 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47487 N->getOperand(0));
47488 return Result;
47489 };
47490
47491 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
47492 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47493 DAG.getConstant(Mul1, DL, VT));
47494 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
47495 DAG.getConstant(Mul2, DL, VT));
47496 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47497 N->getOperand(0));
47498 return Result;
47499 };
47500
47501 switch (MulAmt) {
47502 default:
47503 break;
47504 case 11:
47505 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47506 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47507 case 21:
47508 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47509 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47510 case 41:
47511 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47512 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47513 case 22:
47514 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47515 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47516 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47517 case 19:
47518 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47519 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47520 case 37:
47521 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47522 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47523 case 73:
47524 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47525 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47526 case 13:
47527 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47528 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47529 case 23:
47530 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47531 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47532 case 26:
47533 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47534 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47535 case 28:
47536 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47537 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47538 case 29:
47539 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47540 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47541 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47542 }
47543
47544 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
47545 // by a single LEA.
47546 // First check if this is a sum of two powers of 2 because that's easy. Then
47547 // count the trailing zeros of the amount to get the 2/4/8 LEA scale.
47548 // TODO: We can do this even without LEA at a cost of two shifts and an add.
47549 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47550 unsigned ScaleShift = llvm::countr_zero(MulAmt);
47551 if (ScaleShift >= 1 && ScaleShift < 4) {
47552 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47553 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47554 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47555 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47556 DAG.getConstant(ScaleShift, DL, MVT::i8));
47557 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47558 }
47559 }
47560
47561 return SDValue();
47562}
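// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): the decompositions used
// by combineMulSpecial above are plain integer identities, e.g.
//   mul x, 11 == ((x * 5) << 1) + x   and   mul x, 23 == ((x * 3) << 3) - x.
// A standalone numeric check of a few of them:
#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t x = 0; x < 1000; ++x) {
    assert(((x * 5) << 1) + x == x * 11);
    assert(((x * 5) << 2) + x == x * 21);
    assert(((x * 9) << 1) + x == x * 19);
    assert(((x * 3) << 2) + x == x * 13);
    assert(((x * 3) << 3) - x == x * 23);
    assert(((x * 5) * 5) + x == x * 26);
    assert(((x * 9) * 3) + x == x * 28);
  }
}
// --------------------------------------------------------------------------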
47563
47564// If the upper 17 bits of either element are zero and the other element's
47565// upper bits are all zero/sign bits, then we can use PMADDWD, which is always
47566// at least as quick as PMULLD, except on KNL.
47567static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
47568 const X86Subtarget &Subtarget) {
47569 if (!Subtarget.hasSSE2())
47570 return SDValue();
47571
47572 if (Subtarget.isPMADDWDSlow())
47573 return SDValue();
47574
47575 EVT VT = N->getValueType(0);
47576
47577 // Only support vXi32 vectors.
47578 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47579 return SDValue();
47580
47581 // Make sure the type is legal or can split/widen to a legal type.
47582 // With AVX512 but without BWI, we would need to split v32i16.
47583 unsigned NumElts = VT.getVectorNumElements();
47584 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47585 return SDValue();
47586
47587 // With AVX512 but without BWI, we would need to split v32i16.
47588 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47589 return SDValue();
47590
47591 SDValue N0 = N->getOperand(0);
47592 SDValue N1 = N->getOperand(1);
47593
47594 // If we are zero/sign extending in two steps without SSE4.1, it's better to
47595 // reduce the vmul width instead.
47596 if (!Subtarget.hasSSE41() &&
47597 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47598 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47599 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47600 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47601 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47602 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47603 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47604 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47605 return SDValue();
47606
47607 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47608 // the vmul width instead.
47609 if (!Subtarget.hasSSE41() &&
47610 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47611 N0.getOperand(0).getValueSizeInBits() > 128) &&
47612 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47613 N1.getOperand(0).getValueSizeInBits() > 128))
47614 return SDValue();
47615
47616 // Sign bits must extend down to the lowest i16.
47617 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47618 DAG.ComputeMaxSignificantBits(N0) > 16)
47619 return SDValue();
47620
47621 // At least one of the elements must be zero in the upper 17 bits, or can be
47622 // safely made zero without altering the final result.
47623 auto GetZeroableOp = [&](SDValue Op) {
47624 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47625 if (DAG.MaskedValueIsZero(Op, Mask17))
47626 return Op;
47627 // Mask off upper 16-bits of sign-extended constants.
47628 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47629 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
47630 DAG.getConstant(0xFFFF, SDLoc(N), VT));
47631 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47632 SDValue Src = Op.getOperand(0);
47633 // Convert sext(vXi16) to zext(vXi16).
47634 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47635 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47636 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47637 // which will expand the extension.
47638 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47639 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47640 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
47641 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47642 }
47643 }
47644 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
47645 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47646 N->isOnlyUserOf(Op.getNode())) {
47647 SDValue Src = Op.getOperand(0);
47648 if (Src.getScalarValueSizeInBits() == 16)
47649 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
47650 }
47651 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47652 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47653 N->isOnlyUserOf(Op.getNode())) {
47654 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
47655 Op.getOperand(1));
47656 }
47657 return SDValue();
47658 };
47659 SDValue ZeroN0 = GetZeroableOp(N0);
47660 SDValue ZeroN1 = GetZeroableOp(N1);
47661 if (!ZeroN0 && !ZeroN1)
47662 return SDValue();
47663 N0 = ZeroN0 ? ZeroN0 : N0;
47664 N1 = ZeroN1 ? ZeroN1 : N1;
47665
47666 // Use SplitOpsAndApply to handle AVX splitting.
47667 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47668 ArrayRef<SDValue> Ops) {
47669 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47670 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47671 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47672 DAG.getBitcast(OpVT, Ops[0]),
47673 DAG.getBitcast(OpVT, Ops[1]));
47674 };
47675 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
47676 PMADDWDBuilder);
47677}
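// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): per 32-bit lane,
// VPMADDWD computes a[0]*b[0] + a[1]*b[1] on signed 16-bit halves. When one
// operand's upper 17 bits are zero (so its high half is 0 and its low half is
// non-negative) and the other operand is a sign-extended 16-bit value, the
// lane collapses to a plain 16x16->32 multiply, which is why the combine is
// legal. pmaddwd_lane below is a hypothetical scalar model of one lane.
#include <cassert>
#include <cstdint>

static int32_t pmaddwd_lane(int16_t A0, int16_t A1, int16_t B0, int16_t B1) {
  return (int32_t)A0 * B0 + (int32_t)A1 * B1;
}

int main() {
  for (int32_t a = 0; a <= 300; ++a) {      // zeroable operand: high half is 0
    for (int32_t b = -300; b <= 300; ++b) { // sign-extended 16-bit operand
      int16_t BHi = (int16_t)(b < 0 ? -1 : 0);
      assert(pmaddwd_lane((int16_t)a, 0, (int16_t)b, BHi) == a * b);
    }
  }
}
// --------------------------------------------------------------------------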
47678
47679static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
47680 const X86Subtarget &Subtarget) {
47681 if (!Subtarget.hasSSE2())
47682 return SDValue();
47683
47684 EVT VT = N->getValueType(0);
47685
47686 // Only support vXi64 vectors.
47687 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47688 VT.getVectorNumElements() < 2 ||
47689 !isPowerOf2_32(VT.getVectorNumElements()))
47690 return SDValue();
47691
47692 SDValue N0 = N->getOperand(0);
47693 SDValue N1 = N->getOperand(1);
47694
47695 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47696 // 32 bits. We can lower with this if the sign bits stretch that far.
47697 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47698 DAG.ComputeNumSignBits(N1) > 32) {
47699 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47700 ArrayRef<SDValue> Ops) {
47701 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47702 };
47703 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47704 PMULDQBuilder, /*CheckBWI*/false);
47705 }
47706
47707 // If the upper bits are zero we can use a single pmuludq.
47708 APInt Mask = APInt::getHighBitsSet(64, 32);
47709 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47710 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47711 ArrayRef<SDValue> Ops) {
47712 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47713 };
47714 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47715 PMULUDQBuilder, /*CheckBWI*/false);
47716 }
47717
47718 return SDValue();
47719}
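// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): PMULUDQ multiplies only
// the low 32 bits of each 64-bit lane (unsigned). When MaskedValueIsZero
// proves the upper 32 bits of both operands are zero, that is already the
// exact 64-bit product, so the wide ISD::MUL can be replaced:
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Vals[] = {0, 1, 2, 0x7FFFFFFFu, 0xFFFFFFFFu, 123456789u};
  for (uint64_t A : Vals)
    for (uint64_t B : Vals)
      assert((uint64_t)(uint32_t)A * (uint32_t)B == A * B);
}
// --------------------------------------------------------------------------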
47720
47721static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47722 TargetLowering::DAGCombinerInfo &DCI,
47723 const X86Subtarget &Subtarget) {
47724 EVT VT = N->getValueType(0);
47725
47726 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
47727 return V;
47728
47729 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
47730 return V;
47731
47732 if (DCI.isBeforeLegalize() && VT.isVector())
47733 return reduceVMULWidth(N, DAG, Subtarget);
47734
47735 // Optimize a single multiply with constant into two operations in order to
47736 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47737 if (!MulConstantOptimization)
47738 return SDValue();
47739
47740 // An imul is usually smaller than the alternative sequence.
47741 if (DAG.getMachineFunction().getFunction().hasMinSize())
47742 return SDValue();
47743
47744 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47745 return SDValue();
47746
47747 if (VT != MVT::i64 && VT != MVT::i32)
47748 return SDValue();
47749
47750 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
47751 if (!C)
47752 return SDValue();
47753 if (isPowerOf2_64(C->getZExtValue()))
47754 return SDValue();
47755
47756 int64_t SignMulAmt = C->getSExtValue();
47757 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47758 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47759
47760 SDLoc DL(N);
47761 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47762 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47763 DAG.getConstant(AbsMulAmt, DL, VT));
47764 if (SignMulAmt < 0)
47765 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
47766 NewMul);
47767
47768 return NewMul;
47769 }
47770
47771 uint64_t MulAmt1 = 0;
47772 uint64_t MulAmt2 = 0;
47773 if ((AbsMulAmt % 9) == 0) {
47774 MulAmt1 = 9;
47775 MulAmt2 = AbsMulAmt / 9;
47776 } else if ((AbsMulAmt % 5) == 0) {
47777 MulAmt1 = 5;
47778 MulAmt2 = AbsMulAmt / 5;
47779 } else if ((AbsMulAmt % 3) == 0) {
47780 MulAmt1 = 3;
47781 MulAmt2 = AbsMulAmt / 3;
47782 }
47783
47784 SDValue NewMul;
47785 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47786 if (MulAmt2 &&
47787 (isPowerOf2_64(MulAmt2) ||
47788 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47789
47790 if (isPowerOf2_64(MulAmt2) &&
47791 !(SignMulAmt >= 0 && N->hasOneUse() &&
47792 N->use_begin()->getOpcode() == ISD::ADD))
47793 // If the second multiplier is a power of 2, issue it first. We want the multiply by
47794 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
47795 // is an add. Only do this for positive multiply amounts since the
47796 // negate would prevent it from being used as an address mode anyway.
47797 std::swap(MulAmt1, MulAmt2);
47798
47799 if (isPowerOf2_64(MulAmt1))
47800 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47801 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47802 else
47803 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47804 DAG.getConstant(MulAmt1, DL, VT));
47805
47806 if (isPowerOf2_64(MulAmt2))
47807 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47808 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47809 else
47810 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47811 DAG.getConstant(MulAmt2, DL, VT));
47812
47813 // Negate the result.
47814 if (SignMulAmt < 0)
47815 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
47816 NewMul);
47817 } else if (!Subtarget.slowLEA())
47818 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47819
47820 if (!NewMul) {
47821 assert(C->getZExtValue() != 0 &&(static_cast <bool> (C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? (18446744073709551615UL
) : (4294967295U)) && "Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 47824, __extension__
__PRETTY_FUNCTION__))
47822 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&(static_cast <bool> (C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? (18446744073709551615UL
) : (4294967295U)) && "Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 47824, __extension__
__PRETTY_FUNCTION__))
47823 "Both cases that could cause potential overflows should have "(static_cast <bool> (C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? (18446744073709551615UL
) : (4294967295U)) && "Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 47824, __extension__
__PRETTY_FUNCTION__))
47824 "already been handled.")(static_cast <bool> (C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? (18446744073709551615UL
) : (4294967295U)) && "Both cases that could cause potential overflows should have "
"already been handled.") ? void (0) : __assert_fail ("C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && \"Both cases that could cause potential overflows should have \" \"already been handled.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 47824, __extension__
__PRETTY_FUNCTION__))
;
47825 if (isPowerOf2_64(AbsMulAmt - 1)) {
47826 // (mul x, 2^N + 1) => (add (shl x, N), x)
47827 NewMul = DAG.getNode(
47828 ISD::ADD, DL, VT, N->getOperand(0),
47829 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47830 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
47831 MVT::i8)));
47832 // To negate, subtract the number from zero
47833 if (SignMulAmt < 0)
47834 NewMul = DAG.getNode(ISD::SUB, DL, VT,
47835 DAG.getConstant(0, DL, VT), NewMul);
47836 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47837 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47838 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47839 DAG.getConstant(Log2_64(AbsMulAmt + 1),
47840 DL, MVT::i8));
47841 // To negate, reverse the operands of the subtract.
47842 if (SignMulAmt < 0)
47843 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47844 else
47845 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47846 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
47847 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47848 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47849 DAG.getConstant(Log2_64(AbsMulAmt - 2),
47850 DL, MVT::i8));
47851 NewMul = DAG.getNode(
47852 ISD::ADD, DL, VT, NewMul,
47853 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47854 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
47855 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47856 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47857 DAG.getConstant(Log2_64(AbsMulAmt + 2),
47858 DL, MVT::i8));
47859 NewMul = DAG.getNode(
47860 ISD::SUB, DL, VT, NewMul,
47861 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47862 }
47863 }
47864
47865 return NewMul;
47866}
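// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): the 2^N +/- 1 and
// 2^N +/- 2 decompositions at the end of combineMul are simple identities,
// e.g. x*17 = (x<<4)+x, x*31 = (x<<5)-x, x*18 = (x<<4)+(x+x),
// x*30 = (x<<5)-(x+x). A standalone check:
#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t x = 0; x < 1000; ++x) {
    assert((x << 4) + x == x * 17);       // 2^N + 1
    assert((x << 5) - x == x * 31);       // 2^N - 1
    assert((x << 4) + (x + x) == x * 18); // 2^N + 2
    assert((x << 5) - (x + x) == x * 30); // 2^N - 2
  }
}
// --------------------------------------------------------------------------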
47867
47868// Try to form a MULHU or MULHS node by looking for
47869// (srl (mul ext, ext), 16)
47870// TODO: This is X86 specific because we want to be able to handle wide types
47871// before type legalization. But we can only do it if the vector will be
47872// legalized via widening/splitting. Type legalization can't handle promotion
47873// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47874// combiner.
47875static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
47876 const X86Subtarget &Subtarget) {
47877 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47878        "SRL or SRA node is required here!");
47879 SDLoc DL(N);
47880
47881 if (!Subtarget.hasSSE2())
47882 return SDValue();
47883
47884 // The operation feeding into the shift must be a multiply.
47885 SDValue ShiftOperand = N->getOperand(0);
47886 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47887 return SDValue();
47888
47889 // Input type should be at least vXi32.
47890 EVT VT = N->getValueType(0);
47891 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47892 return SDValue();
47893
47894 // Need a shift by 16.
47895 APInt ShiftAmt;
47896 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47897 ShiftAmt != 16)
47898 return SDValue();
47899
47900 SDValue LHS = ShiftOperand.getOperand(0);
47901 SDValue RHS = ShiftOperand.getOperand(1);
47902
47903 unsigned ExtOpc = LHS.getOpcode();
47904 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47905 RHS.getOpcode() != ExtOpc)
47906 return SDValue();
47907
47908 // Peek through the extends.
47909 LHS = LHS.getOperand(0);
47910 RHS = RHS.getOperand(0);
47911
47912 // Ensure the input types match.
47913 EVT MulVT = LHS.getValueType();
47914 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47915 return SDValue();
47916
47917 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47918 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47919
47920 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47921 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47922}
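// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): combineShiftToPMULH
// requires both multiply operands to use the *same* extension kind because
// the high halves of the zero- and sign-extended products differ. Example
// with a = 0xFFFF and b = 3 (two's-complement narrowing and arithmetic right
// shift assumed for the signed case):
#include <cassert>
#include <cstdint>

int main() {
  uint16_t a = 0xFFFF, b = 3;
  uint32_t HiU = ((uint32_t)a * (uint32_t)b) >> 16;       // MULHU lane: 2
  int32_t HiS = ((int32_t)(int16_t)a * (int16_t)b) >> 16; // MULHS lane: -1
  assert(HiU == 2 && HiS == -1);
}
// --------------------------------------------------------------------------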
47923
47924static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47925 SDValue N0 = N->getOperand(0);
47926 SDValue N1 = N->getOperand(1);
47927 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47928 EVT VT = N0.getValueType();
47929
47930 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47931 // since the result of setcc_c is all zeros or all ones.
47932 if (VT.isInteger() && !VT.isVector() &&
47933 N1C && N0.getOpcode() == ISD::AND &&
47934 N0.getOperand(1).getOpcode() == ISD::Constant) {
47935 SDValue N00 = N0.getOperand(0);
47936 APInt Mask = N0.getConstantOperandAPInt(1);
47937 Mask <<= N1C->getAPIntValue();
47938 bool MaskOK = false;
47939 // We can handle cases concerning bit-widening nodes containing setcc_c if
47940 // we carefully interrogate the mask to make sure the transform is
47941 // semantics preserving.
47942 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47943 // of the underlying setcc_c operation if the setcc_c was zero extended.
47944 // Consider the following example:
47945 // zext(setcc_c) -> i32 0x0000FFFF
47946 // c1 -> i32 0x0000FFFF
47947 // c2 -> i32 0x00000001
47948 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47949 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47950 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47951 MaskOK = true;
47952 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47953 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47954 MaskOK = true;
47955 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47956 N00.getOpcode() == ISD::ANY_EXTEND) &&
47957 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47958 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47959 }
47960 if (MaskOK && Mask != 0) {
47961 SDLoc DL(N);
47962 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47963 }
47964 }
47965
47966 return SDValue();
47967}
47968
47969static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47970 const X86Subtarget &Subtarget) {
47971 SDValue N0 = N->getOperand(0);
47972 SDValue N1 = N->getOperand(1);
47973 EVT VT = N0.getValueType();
47974 unsigned Size = VT.getSizeInBits();
47975
47976 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47977 return V;
47978
47979 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
47980 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
47981 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
47982 // depending on sign of (SarConst - [56,48,32,24,16])
47983
47984 // sexts in X86 are MOVs. The MOVs have the same code size
47985 // as the SHIFTs above (only a shift by 1 has a smaller encoding).
47986 // However, the MOVs have two advantages over a SHIFT:
47987 // 1. MOVs can write to a register that differs from the source.
47988 // 2. MOVs accept memory operands.
47989
47990 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
47991 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
47992 N0.getOperand(1).getOpcode() != ISD::Constant)
47993 return SDValue();
47994
47995 SDValue N00 = N0.getOperand(0);
47996 SDValue N01 = N0.getOperand(1);
47997 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
47998 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
47999 EVT CVT = N1.getValueType();
48000
48001 if (SarConst.isNegative())
48002 return SDValue();
48003
48004 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48005 unsigned ShiftSize = SVT.getSizeInBits();
48006 // Skip types without a corresponding sext/zext and ShlConst values
48007 // that are not one of [56,48,32,24,16].
48008 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48009 continue;
48010 SDLoc DL(N);
48011 SDValue NN =
48012 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48013 SarConst = SarConst - (Size - ShiftSize);
48014 if (SarConst == 0)
48015 return NN;
48016 if (SarConst.isNegative())
48017 return DAG.getNode(ISD::SHL, DL, VT, NN,
48018 DAG.getConstant(-SarConst, DL, CVT));
48019 return DAG.getNode(ISD::SRA, DL, VT, NN,
48020 DAG.getConstant(SarConst, DL, CVT));
48021 }
48022 return SDValue();
48023}
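// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): the ashr(shl(x, 24), C)
// fold above. For C == 24 the shift pair is exactly "sign-extend the low 8
// bits", which is what the SIGN_EXTEND_INREG node expresses; any remaining
// SarConst - 24 becomes a residual SRA. Two's-complement conversions and
// arithmetic right shift are assumed here.
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x = -300; x <= 300; ++x) {
    int32_t ViaShifts = (int32_t)((uint32_t)x << 24) >> 24;
    int32_t ViaSextInReg = (int32_t)(int8_t)(uint8_t)x; // sext_inreg i8
    assert(ViaShifts == ViaSextInReg);
  }
}
// --------------------------------------------------------------------------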
48024
48025static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48026 TargetLowering::DAGCombinerInfo &DCI,
48027 const X86Subtarget &Subtarget) {
48028 SDValue N0 = N->getOperand(0);
48029 SDValue N1 = N->getOperand(1);
48030 EVT VT = N0.getValueType();
48031
48032 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48033 return V;
48034
48035 // Only do this on the last DAG combine as it can interfere with other
48036 // combines.
48037 if (!DCI.isAfterLegalizeDAG())
48038 return SDValue();
48039
48040 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48041 // TODO: This is a generic DAG combine that became an x86-only combine to
48042 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48043 // and-not ('andn').
48044 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48045 return SDValue();
48046
48047 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48048 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48049 if (!ShiftC || !AndC)
48050 return SDValue();
48051
48052 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48053 // transform should reduce code size. It may also enable secondary transforms
48054 // from improved known-bits analysis or instruction selection.
48055 APInt MaskVal = AndC->getAPIntValue();
48056
48057 // If this can be matched by a zero extend, don't optimize.
48058 if (MaskVal.isMask()) {
48059 unsigned TO = MaskVal.countr_one();
48060 if (TO >= 8 && isPowerOf2_32(TO))
48061 return SDValue();
48062 }
48063
48064 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48065 unsigned OldMaskSize = MaskVal.getSignificantBits();
48066 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48067 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48068 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48069 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48070 SDLoc DL(N);
48071 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48072 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48073 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48074 }
48075 return SDValue();
48076}
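// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): the reordering done by
// combineShiftRightLogical is the bit identity
//   srl (and X, C1), C2  ==  and (srl X, C2), (C1 >> C2)
// and the combine only applies it when the shifted mask becomes a narrower
// (8-bit or 32-bit) immediate. Example with C1 = 0xFF00, C2 = 8:
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0xFF00, C2 = 8;
  for (uint32_t x = 0; x < 100000; ++x)
    assert(((x & C1) >> C2) == ((x >> C2) & (C1 >> C2)));
}
// --------------------------------------------------------------------------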
48077
48078static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48079 const X86Subtarget &Subtarget) {
48080 unsigned Opcode = N->getOpcode();
48081 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48082
48083 SDLoc DL(N);
48084 EVT VT = N->getValueType(0);
48085 SDValue N0 = N->getOperand(0);
48086 SDValue N1 = N->getOperand(1);
48087 EVT SrcVT = N0.getValueType();
48088
48089 SDValue BC0 =
48090 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48091 SDValue BC1 =
48092 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48093
48094 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48095 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48096 // truncation trees that help us avoid lane crossing shuffles.
48097 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48098 // TODO: We don't handle vXf64 shuffles yet.
48099 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48100 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48101 SmallVector<SDValue> ShuffleOps;
48102 SmallVector<int> ShuffleMask, ScaledMask;
48103 SDValue Vec = peekThroughBitcasts(BCSrc);
48104 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48105 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48106 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48107 // shuffle to a v4X64 width - we can probably relax this in the future.
48108 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48109 ShuffleOps[0].getValueType().is256BitVector() &&
48110 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48111 SDValue Lo, Hi;
48112 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48113 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48114 Lo = DAG.getBitcast(SrcVT, Lo);
48115 Hi = DAG.getBitcast(SrcVT, Hi);
48116 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48117 Res = DAG.getBitcast(ShufVT, Res);
48118 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48119 return DAG.getBitcast(VT, Res);
48120 }
48121 }
48122 }
48123 }
48124
48125 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48126 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48127 // If either/both ops are a shuffle that can scale to v2x64,
48128 // then see if we can perform this as a v4x32 post shuffle.
48129 SmallVector<SDValue> Ops0, Ops1;
48130 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48131 bool IsShuf0 =
48132 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48133 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48134 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48135 bool IsShuf1 =
48136 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48137 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48138 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48139 if (IsShuf0 || IsShuf1) {
48140 if (!IsShuf0) {
48141 Ops0.assign({BC0});
48142 ScaledMask0.assign({0, 1});
48143 }
48144 if (!IsShuf1) {
48145 Ops1.assign({BC1});
48146 ScaledMask1.assign({0, 1});
48147 }
48148
48149 SDValue LHS, RHS;
48150 int PostShuffle[4] = {-1, -1, -1, -1};
48151 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
48152 if (M < 0)
48153 return true;
48154 Idx = M % 2;
48155 SDValue Src = Ops[M / 2];
48156 if (!LHS || LHS == Src) {
48157 LHS = Src;
48158 return true;
48159 }
48160 if (!RHS || RHS == Src) {
48161 Idx += 2;
48162 RHS = Src;
48163 return true;
48164 }
48165 return false;
48166 };
48167 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
48168 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
48169 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
48170 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
48171 LHS = DAG.getBitcast(SrcVT, LHS);
48172 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
48173 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48174 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
48175 Res = DAG.getBitcast(ShufVT, Res);
48176 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
48177 return DAG.getBitcast(VT, Res);
48178 }
48179 }
48180 }
48181
48182 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48183 if (VT.is256BitVector() && Subtarget.hasInt256()) {
48184 SmallVector<int> Mask0, Mask1;
48185 SmallVector<SDValue> Ops0, Ops1;
48186 SmallVector<int, 2> ScaledMask0, ScaledMask1;
48187 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48188 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48189 !Ops0.empty() && !Ops1.empty() &&
48190 all_of(Ops0,
48191 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48192 all_of(Ops1,
48193 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48194 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48195 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
48196 SDValue Op00 = peekThroughBitcasts(Ops0.front());
48197 SDValue Op10 = peekThroughBitcasts(Ops1.front());
48198 SDValue Op01 = peekThroughBitcasts(Ops0.back());
48199 SDValue Op11 = peekThroughBitcasts(Ops1.back());
48200 if ((Op00 == Op11) && (Op01 == Op10)) {
48201 std::swap(Op10, Op11);
48202 ShuffleVectorSDNode::commuteMask(ScaledMask1);
48203 }
48204 if ((Op00 == Op10) && (Op01 == Op11)) {
48205 const int Map[4] = {0, 2, 1, 3};
48206 SmallVector<int, 4> ShuffleMask(
48207 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
48208 Map[ScaledMask1[1]]});
48209 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
48210 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
48211 DAG.getBitcast(SrcVT, Op01));
48212 Res = DAG.getBitcast(ShufVT, Res);
48213 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
48214 return DAG.getBitcast(VT, Res);
48215 }
48216 }
48217 }
48218
48219 return SDValue();
48220}
48221
48222static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48223 TargetLowering::DAGCombinerInfo &DCI,
48224 const X86Subtarget &Subtarget) {
48225 unsigned Opcode = N->getOpcode();
48226 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48227        "Unexpected pack opcode");
48228
48229 EVT VT = N->getValueType(0);
48230 SDValue N0 = N->getOperand(0);
48231 SDValue N1 = N->getOperand(1);
48232 unsigned NumDstElts = VT.getVectorNumElements();
48233 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48234 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48235 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48236        N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48237        "Unexpected PACKSS/PACKUS input type");
48238
48239 bool IsSigned = (X86ISD::PACKSS == Opcode);
48240
48241 // Constant Folding.
48242 APInt UndefElts0, UndefElts1;
48243 SmallVector<APInt, 32> EltBits0, EltBits1;
48244 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48245 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48246 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
48247 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
48248 unsigned NumLanes = VT.getSizeInBits() / 128;
48249 unsigned NumSrcElts = NumDstElts / 2;
48250 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48251 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48252
48253 APInt Undefs(NumDstElts, 0);
48254 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48255 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48256 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48257 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48258 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48259 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48260
48261 if (UndefElts[SrcIdx]) {
48262 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48263 continue;
48264 }
48265
48266 APInt &Val = EltBits[SrcIdx];
48267 if (IsSigned) {
48268 // PACKSS: Truncate signed value with signed saturation.
48269 // Source values less than dst minint are saturated to minint.
48270 // Source values greater than dst maxint are saturated to maxint.
48271 if (Val.isSignedIntN(DstBitsPerElt))
48272 Val = Val.trunc(DstBitsPerElt);
48273 else if (Val.isNegative())
48274 Val = APInt::getSignedMinValue(DstBitsPerElt);
48275 else
48276 Val = APInt::getSignedMaxValue(DstBitsPerElt);
48277 } else {
48278 // PACKUS: Truncate signed value with unsigned saturation.
48279 // Source values less than zero are saturated to zero.
48280 // Source values greater than dst maxuint are saturated to maxuint.
48281 if (Val.isIntN(DstBitsPerElt))
48282 Val = Val.trunc(DstBitsPerElt);
48283 else if (Val.isNegative())
48284 Val = APInt::getZero(DstBitsPerElt);
48285 else
48286 Val = APInt::getAllOnes(DstBitsPerElt);
48287 }
48288 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48289 }
48290 }
48291
48292 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48293 }
48294
48295 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48296 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48297 return V;
48298
48299 // Try to combine a truncate implemented with PACKUSWB/PACKSSWB with a regular
48300 // truncate to create a larger truncate.
48301 if (Subtarget.hasAVX512() &&
48302 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48303 N0.getOperand(0).getValueType() == MVT::v8i32) {
48304 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48305 (!IsSigned &&
48306 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48307 if (Subtarget.hasVLX())
48308 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48309
48310 // Widen input to v16i32 so we can truncate that.
48311 SDLoc dl(N);
48312 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48313 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48314 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48315 }
48316 }
48317
48318 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48319 if (VT.is128BitVector()) {
48320 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48321 SDValue Src0, Src1;
48322 if (N0.getOpcode() == ExtOpc &&
48323 N0.getOperand(0).getValueType().is64BitVector() &&
48324 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48325 Src0 = N0.getOperand(0);
48326 }
48327 if (N1.getOpcode() == ExtOpc &&
48328 N1.getOperand(0).getValueType().is64BitVector() &&
48329 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48330 Src1 = N1.getOperand(0);
48331 }
48332 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48333 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48334 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48335 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48336 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48337 }
48338
48339 // Try again with pack(*_extend_vector_inreg, undef).
48340 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48341 : ISD::ZERO_EXTEND_VECTOR_INREG;
48342 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48343 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48344 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48345 DAG);
48346 }
48347
48348 // Attempt to combine as shuffle.
48349 SDValue Op(N, 0);
48350 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48351 return Res;
48352
48353 return SDValue();
48354}
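// --------------------------------------------------------------------------
// Illustrative sketch (not from X86ISelLowering.cpp): scalar models of the
// per-element saturation that the PACKSS/PACKUS constant folding above
// performs for an i16 -> i8 pack:
#include <cassert>
#include <cstdint>

static int8_t packss16to8(int16_t V) {  // signed saturation
  if (V < INT8_MIN) return INT8_MIN;
  if (V > INT8_MAX) return INT8_MAX;
  return (int8_t)V;
}

static uint8_t packus16to8(int16_t V) { // unsigned saturation
  if (V < 0) return 0;
  if (V > (int16_t)UINT8_MAX) return UINT8_MAX;
  return (uint8_t)V;
}

int main() {
  assert(packss16to8(-1000) == INT8_MIN && packss16to8(1000) == INT8_MAX);
  assert(packss16to8(7) == 7);
  assert(packus16to8(-1000) == 0 && packus16to8(1000) == UINT8_MAX);
  assert(packus16to8(200) == 200);
}
// --------------------------------------------------------------------------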
48355
48356static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48357 TargetLowering::DAGCombinerInfo &DCI,
48358 const X86Subtarget &Subtarget) {
48359 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48360         X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48361        "Unexpected horizontal add/sub opcode");
48362
48363 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48364 MVT VT = N->getSimpleValueType(0);
48365 SDValue LHS = N->getOperand(0);
48366 SDValue RHS = N->getOperand(1);
48367
48368 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
48369 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48370 LHS.getOpcode() == RHS.getOpcode() &&
48371 LHS.getValueType() == RHS.getValueType() &&
48372 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48373 SDValue LHS0 = LHS.getOperand(0);
48374 SDValue LHS1 = LHS.getOperand(1);
48375 SDValue RHS0 = RHS.getOperand(0);
48376 SDValue RHS1 = RHS.getOperand(1);
48377 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48378 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48379 SDLoc DL(N);
48380 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48381 LHS0.isUndef() ? LHS1 : LHS0,
48382 RHS0.isUndef() ? RHS1 : RHS0);
48383 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48384 Res = DAG.getBitcast(ShufVT, Res);
48385 SDValue NewLHS =
48386 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48387 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48388 SDValue NewRHS =
48389 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48390 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48391 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48392 DAG.getBitcast(VT, NewRHS));
48393 }
48394 }
48395 }
48396
48397 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48398 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48399 return V;
48400
48401 return SDValue();
48402}
48403
48404static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48405 TargetLowering::DAGCombinerInfo &DCI,
48406 const X86Subtarget &Subtarget) {
48407 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48408         X86ISD::VSRL == N->getOpcode()) &&
48409        "Unexpected shift opcode");
48410 EVT VT = N->getValueType(0);
48411 SDValue N0 = N->getOperand(0);
48412 SDValue N1 = N->getOperand(1);
48413
48414 // Shift zero -> zero.
48415 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48416 return DAG.getConstant(0, SDLoc(N), VT);
48417
48418 // Detect constant shift amounts.
48419 APInt UndefElts;
48420 SmallVector<APInt, 32> EltBits;
48421 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
48422 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48423 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48424 EltBits[0].getZExtValue(), DAG);
48425 }
48426
48427 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48428 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48429 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48430 return SDValue(N, 0);
48431
48432 return SDValue();
48433}
48434
48435static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48436 TargetLowering::DAGCombinerInfo &DCI,
48437 const X86Subtarget &Subtarget) {
48438 unsigned Opcode = N->getOpcode();
48439 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48440         X86ISD::VSRLI == Opcode) &&
48441        "Unexpected shift opcode");
48442 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48443 EVT VT = N->getValueType(0);
48444 SDValue N0 = N->getOperand(0);
48445 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48446 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48447 "Unexpected value type");
48448 assert(N->getOperand(1).getValueType() == MVT::i8 &&
48449 "Unexpected shift amount type");
48450
48451 // (shift undef, X) -> 0
48452 if (N0.isUndef())
48453 return DAG.getConstant(0, SDLoc(N), VT);
48454
48455 // Out of range logical bit shifts are guaranteed to be zero.
48456 // Out of range arithmetic bit shifts splat the sign bit.
48457 unsigned ShiftVal = N->getConstantOperandVal(1);
48458 if (ShiftVal >= NumBitsPerElt) {
48459 if (LogicalShift)
48460 return DAG.getConstant(0, SDLoc(N), VT);
48461 ShiftVal = NumBitsPerElt - 1;
48462 }
48463
48464 // (shift X, 0) -> X
48465 if (!ShiftVal)
48466 return N0;
48467
48468 // (shift 0, C) -> 0
48469 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48470 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48471 // result are all zeros, not undef.
48472 return DAG.getConstant(0, SDLoc(N), VT);
48473
48474 // (VSRAI -1, C) -> -1
48475 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48476 // N0 is all ones or undef. We guarantee that the bits shifted into the
48477 // result are all ones, not undef.
48478 return DAG.getConstant(-1, SDLoc(N), VT);
48479
48480 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48481 unsigned NewShiftVal = Amt0 + Amt1;
48482 if (NewShiftVal >= NumBitsPerElt) {
48483 // Out of range logical bit shifts are guaranteed to be zero.
48484 // Out of range arithmetic bit shifts splat the sign bit.
48485 if (LogicalShift)
48486 return DAG.getConstant(0, SDLoc(N), VT);
48487 NewShiftVal = NumBitsPerElt - 1;
48488 }
48489 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48490 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48491 };
48492
48493 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48494 if (Opcode == N0.getOpcode())
48495 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48496
48497 // (shl (add X, X), C) -> (shl X, (C + 1))
48498 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48499 N0.getOperand(0) == N0.getOperand(1))
48500 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48501
48502 // We can decode 'whole byte' logical bit shifts as shuffles.
48503 if (LogicalShift && (ShiftVal % 8) == 0) {
48504 SDValue Op(N, 0);
48505 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48506 return Res;
48507 }
48508
48509 // Constant Folding.
48510 APInt UndefElts;
48511 SmallVector<APInt, 32> EltBits;
48512 if (N->isOnlyUserOf(N0.getNode()) &&
48513 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
48514 assert(EltBits.size() == VT.getVectorNumElements() &&
48515 "Unexpected shift value type");
48516 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48517 // created an undef input due to no input bits being demanded, but user
48518 // still expects 0 in other bits.
48519 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48520 APInt &Elt = EltBits[i];
48521 if (UndefElts[i])
48522 Elt = 0;
48523 else if (X86ISD::VSHLI == Opcode)
48524 Elt <<= ShiftVal;
48525 else if (X86ISD::VSRAI == Opcode)
48526 Elt.ashrInPlace(ShiftVal);
48527 else
48528 Elt.lshrInPlace(ShiftVal);
48529 }
48530 // Reset undef elements since they were zeroed above.
48531 UndefElts = 0;
48532 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48533 }
48534
48535 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48536 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48537 DCI))
48538 return SDValue(N, 0);
48539
48540 return SDValue();
48541}
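
// Illustrative sketch, not part of the original source: a scalar model of the
// MergeShifts fold above, assuming 32-bit lanes and a logical right shift.
// Two shifts by Amt0 and Amt1 become one shift by Amt0 + Amt1, clamped to
// zero once the combined amount reaches the element width (the names below
// are hypothetical).
#include <cstdint>
namespace shift_merge_demo {
constexpr uint32_t mergedLogicalShr(uint32_t X, unsigned Amt0, unsigned Amt1) {
  unsigned Total = Amt0 + Amt1;
  return Total >= 32 ? 0u : (X >> Total); // out-of-range logical shift -> 0
}
static_assert(mergedLogicalShr(0xDEADBEEFu, 8, 8) == ((0xDEADBEEFu >> 8) >> 8), "");
static_assert(mergedLogicalShr(0xDEADBEEFu, 20, 20) == 0u, "");
} // namespace shift_merge_demo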
48542
48543static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48544 TargetLowering::DAGCombinerInfo &DCI,
48545 const X86Subtarget &Subtarget) {
48546 EVT VT = N->getValueType(0);
48547 unsigned Opcode = N->getOpcode();
48548 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48549 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48550 Opcode == ISD::INSERT_VECTOR_ELT) &&
48551 "Unexpected vector insertion");
48552
48553 SDValue Vec = N->getOperand(0);
48554 SDValue Scl = N->getOperand(1);
48555 SDValue Idx = N->getOperand(2);
48556
48557 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48558 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
48559 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
48560
48561 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48562 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48563 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48564 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48565 APInt::getAllOnes(NumBitsPerElt), DCI))
48566 return SDValue(N, 0);
48567 }
48568
48569 // Attempt to combine insertion patterns to a shuffle.
48570 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48571 SDValue Op(N, 0);
48572 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48573 return Res;
48574 }
48575
48576 return SDValue();
48577}
48578
48579/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48580/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48581/// OR -> CMPNEQSS.
48582static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48583 TargetLowering::DAGCombinerInfo &DCI,
48584 const X86Subtarget &Subtarget) {
48585 unsigned opcode;
48586
48587 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48588 // we're requiring SSE2 for both.
48589 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48590 SDValue N0 = N->getOperand(0);
48591 SDValue N1 = N->getOperand(1);
48592 SDValue CMP0 = N0.getOperand(1);
48593 SDValue CMP1 = N1.getOperand(1);
48594 SDLoc DL(N);
48595
48596 // The SETCCs should both refer to the same CMP.
48597 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48598 return SDValue();
48599
48600 SDValue CMP00 = CMP0->getOperand(0);
48601 SDValue CMP01 = CMP0->getOperand(1);
48602 EVT VT = CMP00.getValueType();
48603
48604 if (VT == MVT::f32 || VT == MVT::f64 ||
48605 (VT == MVT::f16 && Subtarget.hasFP16())) {
48606 bool ExpectingFlags = false;
48607 // Check for any users that want flags:
48608 for (const SDNode *U : N->uses()) {
48609 if (ExpectingFlags)
48610 break;
48611
48612 switch (U->getOpcode()) {
48613 default:
48614 case ISD::BR_CC:
48615 case ISD::BRCOND:
48616 case ISD::SELECT:
48617 ExpectingFlags = true;
48618 break;
48619 case ISD::CopyToReg:
48620 case ISD::SIGN_EXTEND:
48621 case ISD::ZERO_EXTEND:
48622 case ISD::ANY_EXTEND:
48623 break;
48624 }
48625 }
48626
48627 if (!ExpectingFlags) {
48628 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48629 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48630
48631 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48632 X86::CondCode tmp = cc0;
48633 cc0 = cc1;
48634 cc1 = tmp;
48635 }
48636
48637 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48638 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48639 // FIXME: need symbolic constants for these magic numbers.
48640 // See X86ATTInstPrinter.cpp:printSSECC().
48641 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48642 if (Subtarget.hasAVX512()) {
48643 SDValue FSetCC =
48644 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48645 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48646 // Need to fill with zeros to ensure the bitcast will produce zeroes
48647 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48648 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48649 DAG.getConstant(0, DL, MVT::v16i1),
48650 FSetCC, DAG.getIntPtrConstant(0, DL));
48651 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48652 N->getSimpleValueType(0));
48653 }
48654 SDValue OnesOrZeroesF =
48655 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48656 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48657
48658 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48659 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48660
48661 if (is64BitFP && !Subtarget.is64Bit()) {
48662 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48663 // 64-bit integer, since that's not a legal type. Since
48664 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48665 // bits, but can do this little dance to extract the lowest 32 bits
48666 // and work with those going forward.
48667 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48668 OnesOrZeroesF);
48669 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48670 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48671 Vector32, DAG.getIntPtrConstant(0, DL));
48672 IntVT = MVT::i32;
48673 }
48674
48675 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48676 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48677 DAG.getConstant(1, DL, IntVT));
48678 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48679 ANDed);
48680 return OneBitOfTruth;
48681 }
48682 }
48683 }
48684 }
48685 return SDValue();
48686}
48687
48688/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48689static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48690 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48691
48692 MVT VT = N->getSimpleValueType(0);
48693 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48694 return SDValue();
48695
48696 SDValue X, Y;
48697 SDValue N0 = N->getOperand(0);
48698 SDValue N1 = N->getOperand(1);
48699
48700 if (SDValue Not = IsNOT(N0, DAG)) {
48701 X = Not;
48702 Y = N1;
48703 } else if (SDValue Not = IsNOT(N1, DAG)) {
48704 X = Not;
48705 Y = N0;
48706 } else
48707 return SDValue();
48708
48709 X = DAG.getBitcast(VT, X);
48710 Y = DAG.getBitcast(VT, Y);
48711 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48712}
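
// Illustrative sketch, not part of the original source: ANDNP computes
// (~X) & Y per lane, which is exactly the (and (xor X, -1), Y) pattern the
// combine above matches. Scalar model with a hypothetical helper name:
#include <cstdint>
namespace andnp_demo {
constexpr uint32_t andnp(uint32_t X, uint32_t Y) { return ~X & Y; }
static_assert(andnp(0xF0F0F0F0u, 0xFFFF0000u) ==
                  ((0xF0F0F0F0u ^ 0xFFFFFFFFu) & 0xFFFF0000u),
              "");
} // namespace andnp_demo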
48713
48714/// Try to fold:
48715/// and (vector_shuffle<Z,...,Z>
48716/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48717/// ->
48718/// andnp (vector_shuffle<Z,...,Z>
48719/// (insert_vector_elt undef, X, Z), undef), Y
48720static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48721 const X86Subtarget &Subtarget) {
48722 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48723
48724 EVT VT = N->getValueType(0);
48725 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
48726 // value and require extra moves.
48727 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48728 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48729 return SDValue();
48730
48731 auto GetNot = [&DAG](SDValue V) {
48732 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48733 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48734 // end-users are ISD::AND including cases
48735 // (and(extract_vector_element(SVN), Y)).
48736 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48737 !SVN->getOperand(1).isUndef()) {
48738 return SDValue();
48739 }
48740 SDValue IVEN = SVN->getOperand(0);
48741 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48742 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48743 return SDValue();
48744 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48745 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48746 return SDValue();
48747 SDValue Src = IVEN.getOperand(1);
48748 if (SDValue Not = IsNOT(Src, DAG)) {
48749 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48750 SDValue NotIVEN =
48751 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
48752 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48753 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48754 SVN->getOperand(1), SVN->getMask());
48755 }
48756 return SDValue();
48757 };
48758
48759 SDValue X, Y;
48760 SDValue N0 = N->getOperand(0);
48761 SDValue N1 = N->getOperand(1);
48762
48763 if (SDValue Not = GetNot(N0)) {
48764 X = Not;
48765 Y = N1;
48766 } else if (SDValue Not = GetNot(N1)) {
48767 X = Not;
48768 Y = N0;
48769 } else
48770 return SDValue();
48771
48772 X = DAG.getBitcast(VT, X);
48773 Y = DAG.getBitcast(VT, Y);
48774 SDLoc DL(N);
48775 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48776 // AVX2.
48777 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
48778 SDValue LoX, HiX;
48779 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48780 SDValue LoY, HiY;
48781 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48782 EVT SplitVT = LoX.getValueType();
48783 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48784 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48785 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48786 }
48787 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48788}
48789
48790// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48791// logical operations, like in the example below.
48792// or (and (truncate x, truncate y)),
48793// (xor (truncate z, build_vector (constants)))
48794// Given a target type \p VT, we generate
48795// or (and x, y), (xor z, zext(build_vector (constants)))
48796// where x, y and z are of type \p VT. We can do so if each operand is either
48797// a truncate from VT, a vector of constants (second operand only), or a value
48798// that can itself be recursively promoted.
48799static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
48800 unsigned Depth) {
48801 // Limit recursion to avoid excessive compile times.
48802 if (Depth >= SelectionDAG::MaxRecursionDepth)
48803 return SDValue();
48804
48805 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
48806 N->getOpcode() != ISD::OR)
48807 return SDValue();
48808
48809 SDValue N0 = N->getOperand(0);
48810 SDValue N1 = N->getOperand(1);
48811 SDLoc DL(N);
48812
48813 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48814 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
48815 return SDValue();
48816
48817 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
48818 N0 = NN0;
48819 else {
48820 // The Left side has to be a trunc.
48821 if (N0.getOpcode() != ISD::TRUNCATE)
48822 return SDValue();
48823
48824 // The type of the truncated inputs.
48825 if (N0.getOperand(0).getValueType() != VT)
48826 return SDValue();
48827
48828 N0 = N0.getOperand(0);
48829 }
48830
48831 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
48832 N1 = NN1;
48833 else {
48834 // The right side has to be a 'trunc' or a constant vector.
48835 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48836 N1.getOperand(0).getValueType() == VT;
48837 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
48838 return SDValue();
48839
48840 if (RHSTrunc)
48841 N1 = N1.getOperand(0);
48842 else
48843 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
48844 }
48845
48846 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
48847}
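
// Illustrative sketch, not part of the original source: the property the
// recursive promotion relies on is that bitwise AND/OR/XOR commute with
// truncation, so the operation can be performed on the wide type and
// narrowed once. Scalar model (i64 -> i32 truncation assumed; names are
// hypothetical):
#include <cstdint>
namespace promote_demo {
constexpr uint32_t trunc32(uint64_t V) { return static_cast<uint32_t>(V); }
static_assert(trunc32(0x123456789ABCDEF0ull & 0x0FF00FF00FF00FF0ull) ==
                  (trunc32(0x123456789ABCDEF0ull) &
                   trunc32(0x0FF00FF00FF00FF0ull)),
              "");
} // namespace promote_demo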
48848
48849// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48850// register. In most cases we actually compare or select YMM-sized registers
48851// and mixing the two types creates horrible code. This method optimizes
48852// some of the transition sequences.
48853// Even with AVX-512 this is still useful for removing casts around logical
48854// operations on vXi1 mask types.
48855static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
48856 const X86Subtarget &Subtarget) {
48857 EVT VT = N->getValueType(0);
48858 assert(VT.isVector() && "Expected vector type");
48859
48860 SDLoc DL(N);
48861 assert((N->getOpcode() == ISD::ANY_EXTEND ||
48862 N->getOpcode() == ISD::ZERO_EXTEND ||
48863 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48864
48865 SDValue Narrow = N->getOperand(0);
48866 EVT NarrowVT = Narrow.getValueType();
48867
48868 // Generate the wide operation.
48869 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
48870 if (!Op)
48871 return SDValue();
48872 switch (N->getOpcode()) {
48873 default: llvm_unreachable("Unexpected opcode");
48874 case ISD::ANY_EXTEND:
48875 return Op;
48876 case ISD::ZERO_EXTEND:
48877 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48878 case ISD::SIGN_EXTEND:
48879 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48880 Op, DAG.getValueType(NarrowVT));
48881 }
48882}
48883
48884static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48885 unsigned FPOpcode;
48886 switch (Opcode) {
48887 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48888 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48889 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48890 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48891 }
48892 return FPOpcode;
48893}
48894
48895/// If both input operands of a logic op are being cast from floating-point
48896/// types or FP compares, try to convert this into a floating-point logic node
48897/// to avoid unnecessary moves from SSE to integer registers.
48898static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48899 TargetLowering::DAGCombinerInfo &DCI,
48900 const X86Subtarget &Subtarget) {
48901 EVT VT = N->getValueType(0);
48902 SDValue N0 = N->getOperand(0);
48903 SDValue N1 = N->getOperand(1);
48904 SDLoc DL(N);
48905
48906 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48907 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48908 return SDValue();
48909
48910 SDValue N00 = N0.getOperand(0);
48911 SDValue N10 = N1.getOperand(0);
48912 EVT N00Type = N00.getValueType();
48913 EVT N10Type = N10.getValueType();
48914
48915 // Ensure that both types are the same and are legal scalar fp types.
48916 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48917 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
48918 (Subtarget.hasFP16() && N00Type == MVT::f16)))
48919 return SDValue();
48920
48921 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
48922 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
48923 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
48924 return DAG.getBitcast(VT, FPLogic);
48925 }
48926
48927 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
48928 !N1.hasOneUse())
48929 return SDValue();
48930
48931 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48932 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
48933
48934 // The vector ISA for FP predicates is incomplete before AVX, so converting
48935 // COMIS* to CMPS* may not be a win before AVX.
48936 if (!Subtarget.hasAVX() &&
48937 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
48938 return SDValue();
48939
48940 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
48941 // and vector logic:
48942 // logic (setcc N00, N01), (setcc N10, N11) -->
48943 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
48944 unsigned NumElts = 128 / N00Type.getSizeInBits();
48945 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
48946 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
48947 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
48948 SDValue N01 = N0.getOperand(1);
48949 SDValue N11 = N1.getOperand(1);
48950 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
48951 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
48952 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
48953 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
48954 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
48955 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
48956 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
48957 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
48958}
48959
48960// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
48961// to reduce XMM->GPR traffic.
48962static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
48963 unsigned Opc = N->getOpcode();
48964 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48965 "Unexpected bit opcode");
48966
48967 SDValue N0 = N->getOperand(0);
48968 SDValue N1 = N->getOperand(1);
48969
48970 // Both operands must be single use MOVMSK.
48971 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
48972 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
48973 return SDValue();
48974
48975 SDValue Vec0 = N0.getOperand(0);
48976 SDValue Vec1 = N1.getOperand(0);
48977 EVT VecVT0 = Vec0.getValueType();
48978 EVT VecVT1 = Vec1.getValueType();
48979
48980 // Both MOVMSK operands must be from vectors of the same size and same element
48981 // size, but it's OK for an fp/int difference.
48982 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
48983 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
48984 return SDValue();
48985
48986 SDLoc DL(N);
48987 unsigned VecOpc =
48988 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
48989 SDValue Result =
48990 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
48991 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48992}
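
// Illustrative sketch, not part of the original source: MOVMSK packs one sign
// bit per byte, so a bitwise op on two masks in GPRs equals MOVMSK of the same
// bitwise op applied to the vectors. SSE2 sketch with a hypothetical helper:
#include <immintrin.h>
namespace movmsk_demo {
inline bool movmskDistributesOverAnd(__m128i X, __m128i Y) {
  // Sign bit of each byte of (X & Y) == (sign bit of X) & (sign bit of Y).
  return (_mm_movemask_epi8(X) & _mm_movemask_epi8(Y)) ==
         _mm_movemask_epi8(_mm_and_si128(X, Y));
}
} // namespace movmsk_demo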
48993
48994// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
48995// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
48996// handles in InstCombine.
48997static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
48998 unsigned Opc = N->getOpcode();
48999 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49000 "Unexpected bit opcode");
49001
49002 SDValue N0 = N->getOperand(0);
49003 SDValue N1 = N->getOperand(1);
49004 EVT VT = N->getValueType(0);
49005
49006 // Both operands must be single use.
49007 if (!N0.hasOneUse() || !N1.hasOneUse())
49008 return SDValue();
49009
49010 // Search for matching shifts.
49011 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49012 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49013
49014 unsigned BCOpc = BC0.getOpcode();
49015 EVT BCVT = BC0.getValueType();
49016 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49017 return SDValue();
49018
49019 switch (BCOpc) {
49020 case X86ISD::VSHLI:
49021 case X86ISD::VSRLI:
49022 case X86ISD::VSRAI: {
49023 if (BC0.getOperand(1) != BC1.getOperand(1))
49024 return SDValue();
49025
49026 SDLoc DL(N);
49027 SDValue BitOp =
49028 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49029 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49030 return DAG.getBitcast(VT, Shift);
49031 }
49032 }
49033
49034 return SDValue();
49035}
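
// Illustrative sketch, not part of the original source: bitwise ops distribute
// over a shared shift amount, which is the distributive-law special case
// matched above. Scalar model for a logical right shift (hypothetical names):
#include <cstdint>
namespace bitop_shift_demo {
constexpr bool distributes(uint32_t X, uint32_t Y, unsigned Z) {
  return ((X >> Z) | (Y >> Z)) == ((X | Y) >> Z);
}
static_assert(distributes(0xCAFEBABEu, 0x12345678u, 7), "");
} // namespace bitop_shift_demo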
49036
49037/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49038/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49039/// with a shift-right to eliminate loading the vector constant mask value.
49040static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49041 const X86Subtarget &Subtarget) {
49042 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49043 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49044 EVT VT = Op0.getValueType();
49045 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49046 return SDValue();
49047
49048 // Try to convert an "is positive" signbit masking operation into arithmetic
49049 // shift and "andn". This saves a materialization of a -1 vector constant.
49050 // The "is negative" variant should be handled more generally because it only
49051 // requires "and" rather than "andn":
49052 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49053 //
49054 // This is limited to the original type to avoid producing even more bitcasts.
49055 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49056 // will be profitable.
49057 if (N->getValueType(0) == VT &&
49058 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
49059 SDValue X, Y;
49060 if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
49061 isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
49062 X = Op1.getOperand(0);
49063 Y = Op0;
49064 } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
49065 isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
49066 X = Op0.getOperand(0);
49067 Y = Op1;
49068 }
49069 if (X && Y) {
49070 SDLoc DL(N);
49071 SDValue Sra =
49072 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49073 VT.getScalarSizeInBits() - 1, DAG);
49074 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49075 }
49076 }
49077
49078 APInt SplatVal;
49079 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49080 !SplatVal.isMask())
49081 return SDValue();
49082
49083 // Don't prevent creation of ANDN.
49084 if (isBitwiseNot(Op0))
49085 return SDValue();
49086
49087 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
49088 return SDValue();
49089
49090 unsigned EltBitWidth = VT.getScalarSizeInBits();
49091 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49092 return SDValue();
49093
49094 SDLoc DL(N);
49095 unsigned ShiftVal = SplatVal.countr_one();
49096 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49097 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49098 return DAG.getBitcast(N->getValueType(0), Shift);
49099}
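
// Illustrative sketch, not part of the original source: when every bit of an
// element is a copy of its sign bit (the element is 0 or all-ones), ANDing
// with a low-bits mask of K ones equals a logical right shift by
// BitWidth - K, so the constant mask load can be dropped. Scalar model,
// 32-bit element and K in [1, 31] assumed:
#include <cstdint>
namespace mask_to_shift_demo {
constexpr bool holds(uint32_t AllSignBits, unsigned K) {
  return (AllSignBits & ((1u << K) - 1u)) == (AllSignBits >> (32u - K));
}
static_assert(holds(0u, 5) && holds(0xFFFFFFFFu, 5), "");
} // namespace mask_to_shift_demo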
49100
49101// Get the index node from the lowered DAG of a GEP IR instruction with one
49102// indexing dimension.
49103static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49104 if (Ld->isIndexed())
49105 return SDValue();
49106
49107 SDValue Base = Ld->getBasePtr();
49108
49109 if (Base.getOpcode() != ISD::ADD)
49110 return SDValue();
49111
49112 SDValue ShiftedIndex = Base.getOperand(0);
49113
49114 if (ShiftedIndex.getOpcode() != ISD::SHL)
49115 return SDValue();
49116
49117 return ShiftedIndex.getOperand(0);
49118
49119}
49120
49121static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
49122 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
49123 switch (VT.getSizeInBits()) {
49124 default: return false;
49125 case 64: return Subtarget.is64Bit() ? true : false;
49126 case 32: return true;
49127 }
49128 }
49129 return false;
49130}
49131
49132// This function recognizes cases where the X86 bzhi instruction can replace an
49133// 'and-load' sequence.
49134// When an integer value is loaded from an array of constants defined as
49135// follows:
49136//
49137// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49138//
49139// and a bitwise AND is then applied to the loaded value and another input,
49140// the sequence is equivalent to performing bzhi (zero high bits) on that
49141// input, using the same index as the load.
49142static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
49143 const X86Subtarget &Subtarget) {
49144 MVT VT = Node->getSimpleValueType(0);
49145 SDLoc dl(Node);
49146
49147 // Check if subtarget has BZHI instruction for the node's type
49148 if (!hasBZHI(Subtarget, VT))
49149 return SDValue();
49150
49151 // Try matching the pattern for both operands.
49152 for (unsigned i = 0; i < 2; i++) {
49153 SDValue N = Node->getOperand(i);
49154 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
49155
49156 // continue if the operand is not a load instruction
49157 if (!Ld)
49158 return SDValue();
49159
49160 const Value *MemOp = Ld->getMemOperand()->getValue();
49161
49162 if (!MemOp)
49163 return SDValue();
49164
49165 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
49166 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49167 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49168
49169 Constant *Init = GV->getInitializer();
49170 Type *Ty = Init->getType();
49171 if (!isa<ConstantDataArray>(Init) ||
49172 !Ty->getArrayElementType()->isIntegerTy() ||
49173 Ty->getArrayElementType()->getScalarSizeInBits() !=
49174 VT.getSizeInBits() ||
49175 Ty->getArrayNumElements() >
49176 Ty->getArrayElementType()->getScalarSizeInBits())
49177 continue;
49178
49179 // Check if the array's constant elements are suitable to our case.
49180 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49181 bool ConstantsMatch = true;
49182 for (uint64_t j = 0; j < ArrayElementCount; j++) {
49183 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49184 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49185 ConstantsMatch = false;
49186 break;
49187 }
49188 }
49189 if (!ConstantsMatch)
49190 continue;
49191
49192 // Do the transformation (For 32-bit type):
49193 // -> (and (load arr[idx]), inp)
49194 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
49195 // that will be replaced with one bzhi instruction.
49196 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49197 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49198
49199 // Get the Node which indexes into the array.
49200 SDValue Index = getIndexFromUnindexedLoad(Ld);
49201 if (!Index)
49202 return SDValue();
49203 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49204
49205 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49206 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49207
49208 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49209 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49210
49211 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49212 }
49213 }
49214 }
49215 }
49216 return SDValue();
49217}
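
// Illustrative sketch, not part of the original source: the table matched
// above stores array[j] == (1 << j) - 1, so the and-load keeps exactly the
// low Idx bits of the other operand, which is what BZHI computes. Scalar
// model with a hypothetical 8-entry table:
#include <cstdint>
namespace bzhi_demo {
constexpr uint32_t Table[8] = {0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, 0x3F, 0x7F};
constexpr uint32_t keepLowBits(uint32_t Inp, unsigned Idx) {
  return Inp & ((1u << Idx) - 1u); // what BZHI(Inp, Idx) would produce
}
static_assert((0xABCDu & Table[4]) == keepLowBits(0xABCDu, 4), "");
} // namespace bzhi_demo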
49218
49219// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
49220// where C is a mask containing the same number of bits as the setcc and
49221// where the setcc freely zeroes the upper bits of the k-register. We can
49222// replace the undef in the concat with 0s and remove the AND. This mainly
49223// helps with v2i1/v4i1 setccs being cast to scalar.
49224static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49225 const X86Subtarget &Subtarget) {
49226 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49227
49228 EVT VT = N->getValueType(0);
49229
49230 // Make sure this is an AND with constant. We will check the value of the
49231 // constant later.
49232 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49233 if (!C1)
49234 return SDValue();
49235
49236 // This is implied by the ConstantSDNode.
49237 assert(!VT.isVector() && "Expected scalar VT!");
49238
49239 SDValue Src = N->getOperand(0);
49240 if (!Src.hasOneUse())
49241 return SDValue();
49242
49243 // (Optionally) peek through any_extend().
49244 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49245 if (!Src.getOperand(0).hasOneUse())
49246 return SDValue();
49247 Src = Src.getOperand(0);
49248 }
49249
49250 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49251 return SDValue();
49252
49253 Src = Src.getOperand(0);
49254 EVT SrcVT = Src.getValueType();
49255
49256 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49257 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49258 !TLI.isTypeLegal(SrcVT))
49259 return SDValue();
49260
49261 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49262 return SDValue();
49263
49264 // We only care about the first subvector of the concat, we expect the
49265 // other subvectors to be ignored due to the AND if we make the change.
49266 SDValue SubVec = Src.getOperand(0);
49267 EVT SubVecVT = SubVec.getValueType();
49268
49269 // The RHS of the AND should be a mask with as many bits as SubVec.
49270 if (!TLI.isTypeLegal(SubVecVT) ||
49271 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49272 return SDValue();
49273
49274 // First subvector should be a setcc with a legal result type or a
49275 // AND containing at least one setcc with a legal result type.
49276 auto IsLegalSetCC = [&](SDValue V) {
49277 if (V.getOpcode() != ISD::SETCC)
49278 return false;
49279 EVT SetccVT = V.getOperand(0).getValueType();
49280 if (!TLI.isTypeLegal(SetccVT) ||
49281 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49282 return false;
49283 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49284 return false;
49285 return true;
49286 };
49287 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49288 (IsLegalSetCC(SubVec.getOperand(0)) ||
49289 IsLegalSetCC(SubVec.getOperand(1))))))
49290 return SDValue();
49291
49292 // We passed all the checks. Rebuild the concat_vectors with zeroes
49293 // and cast it back to VT.
49294 SDLoc dl(N);
49295 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49296 DAG.getConstant(0, dl, SubVecVT));
49297 Ops[0] = SubVec;
49298 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49299 Ops);
49300 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49301 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49302}
49303
49304static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49305 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49306 // We don't want to go crazy with the recursion here. This isn't a super
49307 // important optimization.
49308 static constexpr unsigned kMaxDepth = 2;
49309
49310 // Only do this re-ordering if op has one use.
49311 if (!Op.hasOneUse())
49312 return SDValue();
49313
49314 SDLoc DL(Op);
49315 // If we hit another associative op, recurse further.
49316 if (Op.getOpcode() == Opc) {
49317 // Done recursing.
49318 if (Depth++ >= kMaxDepth)
49319 return SDValue();
49320
49321 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49322 if (SDValue R =
49323 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49324 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49325 Op.getOperand(1 - OpIdx));
49326
49327 } else if (Op.getOpcode() == ISD::SUB) {
49328 if (Opc == ISD::AND) {
49329 // BLSI: (and x, (sub 0, x))
49330 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49331 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49332 }
49333 // Opc must be ISD::AND or ISD::XOR
49334 // BLSR: (and x, (sub x, 1))
49335 // BLSMSK: (xor x, (sub x, 1))
49336 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49337 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49338
49339 } else if (Op.getOpcode() == ISD::ADD) {
49340 // Opc must be ISD::AND or ISD::XOR
49341 // BLSR: (and x, (add x, -1))
49342 // BLSMSK: (xor x, (add x, -1))
49343 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49344 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49345 }
49346 return SDValue();
49347}
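
// Illustrative sketch, not part of the original source: the scalar identities
// getBMIMatchingOp looks for. BLSI isolates the lowest set bit, BLSR clears
// it, and BLSMSK builds a mask up to and including it (hypothetical names):
#include <cstdint>
namespace bmi_demo {
constexpr uint32_t blsi(uint32_t X) { return X & (0u - X); }   // x & -x
constexpr uint32_t blsr(uint32_t X) { return X & (X - 1u); }   // clear lowest set bit
constexpr uint32_t blsmsk(uint32_t X) { return X ^ (X - 1u); } // mask through lowest set bit
static_assert(blsi(0x28u) == 0x08u, "");
static_assert(blsr(0x28u) == 0x20u, "");
static_assert(blsmsk(0x28u) == 0x0Fu, "");
} // namespace bmi_demo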
49348
49349static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49350 const X86Subtarget &Subtarget) {
49351 EVT VT = N->getValueType(0);
49352 // Make sure this node is a candidate for BMI instructions.
49353 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49354 (VT != MVT::i32 && VT != MVT::i64))
49355 return SDValue();
49356
49357 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49358
49359 // Try and match LHS and RHS.
49360 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49361 if (SDValue OpMatch =
49362 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49363 N->getOperand(1 - OpIdx), 0))
49364 return OpMatch;
49365 return SDValue();
49366}
49367
49368static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49369 TargetLowering::DAGCombinerInfo &DCI,
49370 const X86Subtarget &Subtarget) {
49371 SDValue N0 = N->getOperand(0);
49372 SDValue N1 = N->getOperand(1);
49373 EVT VT = N->getValueType(0);
49374 SDLoc dl(N);
49375 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49376
49377 // If this is SSE1 only convert to FAND to avoid scalarization.
49378 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49379 return DAG.getBitcast(MVT::v4i32,
49380 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49381 DAG.getBitcast(MVT::v4f32, N0),
49382 DAG.getBitcast(MVT::v4f32, N1)));
49383 }
49384
49385 // Use a 32-bit and+zext if upper bits known zero.
49386 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49387 APInt HiMask = APInt::getHighBitsSet(64, 32);
49388 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49389 DAG.MaskedValueIsZero(N0, HiMask)) {
49390 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49391 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49392 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49393 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49394 }
49395 }
49396
49397 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49398 // TODO: Support multiple SrcOps.
49399 if (VT == MVT::i1) {
49400 SmallVector<SDValue, 2> SrcOps;
49401 SmallVector<APInt, 2> SrcPartials;
49402 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49403 SrcOps.size() == 1) {
49404 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49405 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49406 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49407 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49408 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49409 if (Mask) {
49410 assert(SrcPartials[0].getBitWidth() == NumElts &&
49411 "Unexpected partial reduction mask");
49412 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49413 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49414 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49415 }
49416 }
49417 }
49418
49419 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49420 return V;
49421
49422 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49423 return R;
49424
49425 if (SDValue R = combineBitOpWithShift(N, DAG))
49426 return R;
49427
49428 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49429 return FPLogic;
49430
49431 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49432 return R;
49433
49434 if (DCI.isBeforeLegalizeOps())
49435 return SDValue();
49436
49437 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49438 return R;
49439
49440 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49441 return R;
49442
49443 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49444 return ShiftRight;
49445
49446 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49447 return R;
49448
49449 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49450 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49451 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49452 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49453 unsigned Opc0 = N0.getOpcode();
49454 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49455 getTargetConstantFromNode(N0.getOperand(1)) &&
49456 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49457 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49458 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49459 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49460 }
49461 }
49462
49463 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
49464 // avoids slow variable shift (moving shift amount to ECX etc.)
49465 if (isOneConstant(N1) && N0->hasOneUse()) {
49466 SDValue Src = N0;
49467 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49468 Src.getOpcode() == ISD::TRUNCATE) &&
49469 Src.getOperand(0)->hasOneUse())
49470 Src = Src.getOperand(0);
49471 bool ContainsNOT = false;
49472 X86::CondCode X86CC = X86::COND_B;
49473 // Peek through AND(NOT(SRL(X,Y)),1).
49474 if (isBitwiseNot(Src)) {
49475 Src = Src.getOperand(0);
49476 X86CC = X86::COND_AE;
49477 ContainsNOT = true;
49478 }
49479 if (Src.getOpcode() == ISD::SRL &&
49480 !isa<ConstantSDNode>(Src.getOperand(1))) {
49481 SDValue BitNo = Src.getOperand(1);
49482 Src = Src.getOperand(0);
49483 // Peek through AND(SRL(NOT(X),Y),1).
49484 if (isBitwiseNot(Src)) {
49485 Src = Src.getOperand(0);
49486 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49487 ContainsNOT = true;
49488 }
49489 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49490 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49491 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49492 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49493 }
49494 }
49495
49496 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49497 // Attempt to recursively combine a bitmask AND with shuffles.
49498 SDValue Op(N, 0);
49499 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49500 return Res;
49501
49502 // If either operand is a constant mask, then only the elements that aren't
49503 // zero are actually demanded by the other operand.
49504 auto GetDemandedMasks = [&](SDValue Op) {
49505 APInt UndefElts;
49506 SmallVector<APInt> EltBits;
49507 int NumElts = VT.getVectorNumElements();
49508 int EltSizeInBits = VT.getScalarSizeInBits();
49509 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49510 APInt DemandedElts = APInt::getAllOnes(NumElts);
49511 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49512 EltBits)) {
49513 DemandedBits.clearAllBits();
49514 DemandedElts.clearAllBits();
49515 for (int I = 0; I != NumElts; ++I) {
49516 if (UndefElts[I]) {
49517 // We can't assume an undef src element gives an undef dst - the
49518 // other src might be zero.
49519 DemandedBits.setAllBits();
49520 DemandedElts.setBit(I);
49521 } else if (!EltBits[I].isZero()) {
49522 DemandedBits |= EltBits[I];
49523 DemandedElts.setBit(I);
49524 }
49525 }
49526 }
49527 return std::make_pair(DemandedBits, DemandedElts);
49528 };
49529 APInt Bits0, Elts0;
49530 APInt Bits1, Elts1;
49531 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49532 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49533
49534 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49535 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49536 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49537 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49538 if (N->getOpcode() != ISD::DELETED_NODE)
49539 DCI.AddToWorklist(N);
49540 return SDValue(N, 0);
49541 }
49542
49543 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49544 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49545 if (NewN0 || NewN1)
49546 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49547 NewN1 ? NewN1 : N1);
49548 }
49549
49550 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49551 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49552 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49553 isa<ConstantSDNode>(N0.getOperand(1))) {
49554 SDValue BitMask = N1;
49555 SDValue SrcVec = N0.getOperand(0);
49556 EVT SrcVecVT = SrcVec.getValueType();
49557
49558 // Check that the constant bitmask masks whole bytes.
49559 APInt UndefElts;
49560 SmallVector<APInt, 64> EltBits;
49561 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49562 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49563 llvm::all_of(EltBits, [](const APInt &M) {
49564 return M.isZero() || M.isAllOnes();
49565 })) {
49566 unsigned NumElts = SrcVecVT.getVectorNumElements();
49567 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49568 unsigned Idx = N0.getConstantOperandVal(1);
49569
49570 // Create a root shuffle mask from the byte mask and the extracted index.
49571 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49572 for (unsigned i = 0; i != Scale; ++i) {
49573 if (UndefElts[i])
49574 continue;
49575 int VecIdx = Scale * Idx + i;
49576 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49577 }
49578
49579 if (SDValue Shuffle = combineX86ShufflesRecursively(
49580 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49581 X86::MaxShuffleCombineDepth,
49582 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49583 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49584 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49585 N0.getOperand(1));
49586 }
49587 }
49588
49589 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
49590 return R;
49591
49592 return SDValue();
49593}
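
// Illustrative sketch, not part of the original source: the AND(SRL(X,Y),1)
// fold above works because shifting right by a variable amount and masking
// the low bit is just a test of bit Y, which BT + SETCC can do without a
// variable shift. Scalar model (hypothetical name):
#include <cstdint>
namespace bt_demo {
constexpr bool testBit(uint32_t X, unsigned Y) { return ((X >> Y) & 1u) != 0; }
static_assert(testBit(0x24u, 2) && !testBit(0x24u, 3), "");
} // namespace bt_demo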
49594
49595// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
49596static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49597 const X86Subtarget &Subtarget) {
49598 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49599
49600 MVT VT = N->getSimpleValueType(0);
49601 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49602 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49603 return SDValue();
49604
49605 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49606 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49607 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49608 return SDValue();
49609
49610 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49611 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49612 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49613 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49614 return SDValue();
49615
49616 // Attempt to extract constant byte masks.
49617 APInt UndefElts0, UndefElts1;
49618 SmallVector<APInt, 32> EltBits0, EltBits1;
49619 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49620 false, false))
49621 return SDValue();
49622 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49623 false, false))
49624 return SDValue();
49625
49626 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49627 // TODO - add UNDEF elts support.
49628 if (UndefElts0[i] || UndefElts1[i])
49629 return SDValue();
49630 if (EltBits0[i] != ~EltBits1[i])
49631 return SDValue();
49632 }
49633
49634 SDLoc DL(N);
49635
49636 if (useVPTERNLOG(Subtarget, VT)) {
49637 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49638 // VPTERNLOG is only available as vXi32/64-bit types.
49639 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
49640 MVT OpVT =
49641 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49642 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49643 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49644 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49645 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49646 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49647 DAG, Subtarget);
49648 return DAG.getBitcast(VT, Res);
49649 }
49650
49651 SDValue X = N->getOperand(0);
49652 SDValue Y =
49653 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49654 DAG.getBitcast(VT, N1.getOperand(0)));
49655 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49656}
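
// Illustrative sketch, not part of the original source: the bit-select this
// canonicalization produces. Each result bit comes from B where the mask bit
// A is 1 and from C where it is 0, which is also what the 0xCA VPTERNLOG
// immediate encodes (A ? B : C per bit). Scalar model (hypothetical name):
#include <cstdint>
namespace bitselect_demo {
constexpr uint32_t bitSelect(uint32_t A, uint32_t B, uint32_t C) {
  return (B & A) | (C & ~A);
}
static_assert(bitSelect(0xFF00FF00u, 0x12345678u, 0x9ABCDEF0u) == 0x12BC56F0u, "");
} // namespace bitselect_demo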
49657
49658// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49659static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49660 if (N->getOpcode() != ISD::OR)
49661 return false;
49662
49663 SDValue N0 = N->getOperand(0);
49664 SDValue N1 = N->getOperand(1);
49665
49666 // Canonicalize AND to LHS.
49667 if (N1.getOpcode() == ISD::AND)
49668 std::swap(N0, N1);
49669
49670 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49671 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49672 return false;
49673
49674 Mask = N1.getOperand(0);
49675 X = N1.getOperand(1);
49676
49677 // Check to see if the mask appeared in both the AND and ANDNP.
49678 if (N0.getOperand(0) == Mask)
49679 Y = N0.getOperand(1);
49680 else if (N0.getOperand(1) == Mask)
49681 Y = N0.getOperand(0);
49682 else
49683 return false;
49684
49685 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
49686 // ANDNP combine allows other combines to happen that prevent matching.
49687 return true;
49688}
49689
49690// Try to fold:
49691// (or (and (m, y), (pandn m, x)))
49692// into:
49693// (vselect m, x, y)
49694// As a special case, try to fold:
49695// (or (and (m, (sub 0, x)), (pandn m, x)))
49696// into:
49697// (sub (xor X, M), M)
49698static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49699 const X86Subtarget &Subtarget) {
49700 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49701
49702 EVT VT = N->getValueType(0);
49703 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49704 (VT.is256BitVector() && Subtarget.hasInt256())))
49705 return SDValue();
49706
49707 SDValue X, Y, Mask;
49708 if (!matchLogicBlend(N, X, Y, Mask))
49709 return SDValue();
49710
49711 // Validate that X, Y, and Mask are bitcasts, and see through them.
49712 Mask = peekThroughBitcasts(Mask);
49713 X = peekThroughBitcasts(X);
49714 Y = peekThroughBitcasts(Y);
49715
49716 EVT MaskVT = Mask.getValueType();
49717 unsigned EltBits = MaskVT.getScalarSizeInBits();
49718
49719 // TODO: Attempt to handle floating point cases as well?
49720 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
49721 return SDValue();
49722
49723 SDLoc DL(N);
49724
49725 // Attempt to combine to conditional negate: (sub (xor X, M), M)
49726 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
49727 DAG, Subtarget))
49728 return Res;
49729
49730 // PBLENDVB is only available on SSE 4.1.
49731 if (!Subtarget.hasSSE41())
49732 return SDValue();
49733
49734 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
49735 if (Subtarget.hasVLX())
49736 return SDValue();
49737
49738 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
49739
49740 X = DAG.getBitcast(BlendVT, X);
49741 Y = DAG.getBitcast(BlendVT, Y);
49742 Mask = DAG.getBitcast(BlendVT, Mask);
49743 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
49744 return DAG.getBitcast(VT, Mask);
49745}
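A standalone scalar sketch (not from the listing) of the two rewrites above: with a lane mask that is all-zeros or all-ones, or(and(m,y), pandn(m,x)) is a select, and when y == -x it collapses to the conditional negate (x ^ m) - m.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {0, 1, -7, 123456, 0x7ffffffe}) {
    for (int32_t m : {0, -1}) {            // all-zeros / all-ones lane mask
      int32_t y = -x;                      // the (sub 0, x) special case
      int32_t blend = (m & y) | (~m & x);  // or(and(m,y), pandn(m,x))
      assert(blend == (m ? y : x));        // vselect form
      assert(blend == ((x ^ m) - m));      // conditional-negate form
    }
  }
  return 0;
}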
49746
49747// Helper function for combineOrCmpEqZeroToCtlzSrl
49748// Transforms:
49749// seteq(cmp x, 0)
49750// into:
49751// srl(ctlz x), log2(bitsize(x))
49752// Input pattern is checked by caller.
49753static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
49754 SDValue Cmp = Op.getOperand(1);
49755 EVT VT = Cmp.getOperand(0).getValueType();
49756 unsigned Log2b = Log2_32(VT.getSizeInBits());
49757 SDLoc dl(Op);
49758 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
49759 // The result of the shift is true or false, and on X86, the 32-bit
49760 // encoding of shr and lzcnt is more desirable.
49761 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
49762 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
49763 DAG.getConstant(Log2b, dl, MVT::i8));
49764 return Scc;
49765}
49766
49767// Try to transform:
49768// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
49769// into:
49770 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
49771// Will also attempt to match more generic cases, eg:
49772// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
49773// Only applies if the target supports the FastLZCNT feature.
49774static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
49775 TargetLowering::DAGCombinerInfo &DCI,
49776 const X86Subtarget &Subtarget) {
49777 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
49778 return SDValue();
49779
49780 auto isORCandidate = [](SDValue N) {
49781 return (N->getOpcode() == ISD::OR && N->hasOneUse());
49782 };
49783
49784 // Check that the zero extend is extending to 32 bits or more. The code generated by
49785 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
49786 // instructions to clear the upper bits.
49787 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
49788 !isORCandidate(N->getOperand(0)))
49789 return SDValue();
49790
49791 // Check the node matches: setcc(eq, cmp 0)
49792 auto isSetCCCandidate = [](SDValue N) {
49793 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
49794 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
49795 N->getOperand(1).getOpcode() == X86ISD::CMP &&
49796 isNullConstant(N->getOperand(1).getOperand(1)) &&
49797 N->getOperand(1).getValueType().bitsGE(MVT::i32);
49798 };
49799
49800 SDNode *OR = N->getOperand(0).getNode();
49801 SDValue LHS = OR->getOperand(0);
49802 SDValue RHS = OR->getOperand(1);
49803
49804 // Save nodes matching or(or, setcc(eq, cmp 0)).
49805 SmallVector<SDNode *, 2> ORNodes;
49806 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
49807 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
49808 ORNodes.push_back(OR);
49809 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
49810 LHS = OR->getOperand(0);
49811 RHS = OR->getOperand(1);
49812 }
49813
49814 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
49815 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
49816 !isORCandidate(SDValue(OR, 0)))
49817 return SDValue();
49818
49819 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
49820 // to
49821 // or(srl(ctlz),srl(ctlz)).
49822 // The dag combiner can then fold it into:
49823 // srl(or(ctlz, ctlz)).
49824 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
49825 SDValue Ret, NewRHS;
49826 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
49827 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
49828
49829 if (!Ret)
49830 return SDValue();
49831
49832 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
49833 while (ORNodes.size() > 0) {
49834 OR = ORNodes.pop_back_val();
49835 LHS = OR->getOperand(0);
49836 RHS = OR->getOperand(1);
49837 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
49838 if (RHS->getOpcode() == ISD::OR)
49839 std::swap(LHS, RHS);
49840 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
49841 if (!NewRHS)
49842 return SDValue();
49843 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
49844 }
49845
49846 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
49847}
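A standalone sketch of the identity behind the transform above, using C++20 std::countl_zero as an editorial stand-in for the CTLZ node: shifting ctlz(x) right by log2(bitwidth) yields exactly the x == 0 flag.

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 2u, 42u, 0x80000000u, 0xFFFFFFFFu}) {
    unsigned Ctlz = std::countl_zero(x); // 32 when x == 0, otherwise <= 31
    unsigned SetEq = Ctlz >> 5;          // log2(bitsize(x)) == 5 for i32
    assert(SetEq == (x == 0 ? 1u : 0u)); // srl(ctlz x, 5) == seteq(x, 0)
  }
  return 0;
}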
49848
49849static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
49850 SDValue And1_L, SDValue And1_R,
49851 const SDLoc &DL, SelectionDAG &DAG) {
49852 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
49853 return SDValue();
49854 SDValue NotOp = And0_L->getOperand(0);
49855 if (NotOp == And1_R)
49856 std::swap(And1_R, And1_L);
49857 if (NotOp != And1_L)
49858 return SDValue();
49859
49860 // (~(NotOp) & And0_R) | (NotOp & And1_R)
49861 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
49862 EVT VT = And1_L->getValueType(0);
49863 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
49864 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
49865 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
49866 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
49867 return Xor1;
49868}
49869
49870/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
49871 /// equivalent `((x ^ y) & m) ^ y` pattern.
49872/// This is typically a better representation for targets without a fused
49873/// "and-not" operation. This function is intended to be called from a
49874/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
49875static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
49876 // Note that masked-merge variants using XOR or ADD expressions are
49877 // normalized to OR by InstCombine so we only check for OR.
49878 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
49879 SDValue N0 = Node->getOperand(0);
49880 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
49881 return SDValue();
49882 SDValue N1 = Node->getOperand(1);
49883 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
49884 return SDValue();
49885
49886 SDLoc DL(Node);
49887 SDValue N00 = N0->getOperand(0);
49888 SDValue N01 = N0->getOperand(1);
49889 SDValue N10 = N1->getOperand(0);
49890 SDValue N11 = N1->getOperand(1);
49891 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
49892 return Result;
49893 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
49894 return Result;
49895 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
49896 return Result;
49897 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
49898 return Result;
49899 return SDValue();
49900}
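A standalone exhaustive check on 8-bit values (not part of the listing) of the masked-merge identity the fold above applies: (m & x) | (~m & y) == ((x ^ y) & m) ^ y.

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned m = 0; m < 256; ++m)
    for (unsigned x = 0; x < 256; ++x)
      for (unsigned y = 0; y < 256; ++y) {
        uint8_t Merge = static_cast<uint8_t>((m & x) | (~m & y));
        uint8_t Folded = static_cast<uint8_t>(((x ^ y) & m) ^ y);
        assert(Merge == Folded); // one AND/XOR chain, no and-not needed
      }
  return 0;
}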
49901
49902/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49903/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49904/// with CMP+{ADC, SBB}.
49905/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
49906static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
49907 SDValue X, SDValue Y,
49908 SelectionDAG &DAG,
49909 bool ZeroSecondOpOnly = false) {
49910 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
49911 return SDValue();
49912
49913 // Look through a one-use zext.
49914 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
49915 Y = Y.getOperand(0);
49916
49917 X86::CondCode CC;
49918 SDValue EFLAGS;
49919 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
49920 CC = (X86::CondCode)Y.getConstantOperandVal(0);
49921 EFLAGS = Y.getOperand(1);
49922 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
49923 Y.hasOneUse()) {
49924 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
49925 }
49926
49927 if (!EFLAGS)
49928 return SDValue();
49929
49930 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49931 // the general case below.
49932 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49933 if (ConstantX && !ZeroSecondOpOnly) {
49934 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
49935 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
49936 // This is a complicated way to get -1 or 0 from the carry flag:
49937 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49938 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49939 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49940 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49941 EFLAGS);
49942 }
49943
49944 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
49945 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
49946 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49947 EFLAGS.getValueType().isInteger() &&
49948 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49949 // Swap the operands of a SUB, and we have the same pattern as above.
49950 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49951 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
49952 SDValue NewSub = DAG.getNode(
49953 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49954 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49955 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49956 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49957 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49958 NewEFLAGS);
49959 }
49960 }
49961 }
49962
49963 if (CC == X86::COND_B) {
49964 // X + SETB Z --> adc X, 0
49965 // X - SETB Z --> sbb X, 0
49966 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49967 DAG.getVTList(VT, MVT::i32), X,
49968 DAG.getConstant(0, DL, VT), EFLAGS);
49969 }
49970
49971 if (ZeroSecondOpOnly)
49972 return SDValue();
49973
49974 if (CC == X86::COND_A) {
49975 // Try to convert COND_A into COND_B in an attempt to facilitate
49976 // materializing "setb reg".
49977 //
49979 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
49979 // cannot take an immediate as its first operand.
49980 //
49981 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49982 EFLAGS.getValueType().isInteger() &&
49983 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49984 SDValue NewSub =
49985 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49986 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49987 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49988 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49989 DAG.getVTList(VT, MVT::i32), X,
49990 DAG.getConstant(0, DL, VT), NewEFLAGS);
49991 }
49992 }
49993
49994 if (CC == X86::COND_AE) {
49995 // X + SETAE --> sbb X, -1
49996 // X - SETAE --> adc X, -1
49997 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49998 DAG.getVTList(VT, MVT::i32), X,
49999 DAG.getConstant(-1, DL, VT), EFLAGS);
50000 }
50001
50002 if (CC == X86::COND_BE) {
50003 // X + SETBE --> sbb X, -1
50004 // X - SETBE --> adc X, -1
50005 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50006 // materializing "setae reg".
50007 //
50008 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
50009 // cannot take an immediate as its first operand.
50010 //
50011 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50012 EFLAGS.getValueType().isInteger() &&
50013 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50014 SDValue NewSub =
50015 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50016 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50017 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50018 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50019 DAG.getVTList(VT, MVT::i32), X,
50020 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50021 }
50022 }
50023
50024 if (CC != X86::COND_E && CC != X86::COND_NE)
50025 return SDValue();
50026
50027 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50028 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50029 !EFLAGS.getOperand(0).getValueType().isInteger())
50030 return SDValue();
50031
50032 SDValue Z = EFLAGS.getOperand(0);
50033 EVT ZVT = Z.getValueType();
50034
50035 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50036 // the general case below.
50037 if (ConstantX) {
50038 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50039 // fake operands:
50040 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50041 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50042 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50043 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50044 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50045 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50046 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50047 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50048 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50049 SDValue(Neg.getNode(), 1));
50050 }
50051
50052 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50053 // with fake operands:
50054 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50055 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50056 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50057 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50058 SDValue One = DAG.getConstant(1, DL, ZVT);
50059 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50060 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50061 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50062 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50063 Cmp1.getValue(1));
50064 }
50065 }
50066
50067 // (cmp Z, 1) sets the carry flag if Z is 0.
50068 SDValue One = DAG.getConstant(1, DL, ZVT);
50069 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50070 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50071
50072 // Add the flags type for ADC/SBB nodes.
50073 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50074
50075 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50076 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50077 if (CC == X86::COND_NE)
50078 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50079 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50080
50081 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50082 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50083 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50084 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50085}
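A standalone model (plain unsigned arithmetic, not SelectionDAG code) of the carry-flag identities used above, assuming the usual semantics adc(X,C,CF) = X + C + CF and sbb(X,C,CF) = X - C - CF, with SETB == CF and SETAE == 1 - CF.

#include <cassert>
#include <cstdint>

static uint32_t adc(uint32_t X, uint32_t C, uint32_t CF) { return X + C + CF; }
static uint32_t sbb(uint32_t X, uint32_t C, uint32_t CF) { return X - C - CF; }

int main() {
  for (uint32_t X : {0u, 1u, 1000u, 0xFFFFFFFFu})
    for (uint32_t CF : {0u, 1u}) {              // SETB == CF, SETAE == 1 - CF
      assert(X + CF == adc(X, 0, CF));          // X + SETB  --> adc X, 0
      assert(X - CF == sbb(X, 0, CF));          // X - SETB  --> sbb X, 0
      assert(X + (1 - CF) == sbb(X, -1u, CF));  // X + SETAE --> sbb X, -1
      assert(X - (1 - CF) == adc(X, -1u, CF));  // X - SETAE --> adc X, -1
    }
  return 0;
}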
50086
50087/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50088/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50089/// with CMP+{ADC, SBB}.
50090static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50091 bool IsSub = N->getOpcode() == ISD::SUB;
50092 SDValue X = N->getOperand(0);
50093 SDValue Y = N->getOperand(1);
50094 EVT VT = N->getValueType(0);
50095 SDLoc DL(N);
50096
50097 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50098 return ADCOrSBB;
50099
50100 // Commute and try again (negate the result for subtracts).
50101 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50102 if (IsSub)
50103 ADCOrSBB =
50104 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50105 return ADCOrSBB;
50106 }
50107
50108 return SDValue();
50109}
50110
50111static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50112 SelectionDAG &DAG) {
50113 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50114 "Unexpected opcode");
50115
50116 // Delegate to combineAddOrSubToADCOrSBB if we have:
50117 //
50118 // (xor/or (zero_extend (setcc)) imm)
50119 //
50120 // where imm is odd if and only if we have xor, in which case the XOR/OR are
50121 // equivalent to a SUB/ADD, respectively.
50122 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
50123 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
50124 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
50125 bool IsSub = N->getOpcode() == ISD::XOR;
50126 bool N1COdd = N1C->getZExtValue() & 1;
50127 if (IsSub ? N1COdd : !N1COdd) {
50128 SDLoc DL(N);
50129 EVT VT = N->getValueType(0);
50130 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
50131 return R;
50132 }
50133 }
50134 }
50135
50136 return SDValue();
50137}
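A standalone check (not part of the listing) of why the delegation above is valid: for a boolean b produced by a setcc, XOR with an odd immediate behaves like a subtraction and OR with an even immediate like an addition.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t b : {0u, 1u}) {
    for (uint32_t OddImm : {1u, 7u, 0x12345u})  // odd immediates: xor == sub
      assert((OddImm ^ b) == OddImm - b);
    for (uint32_t EvenImm : {0u, 8u, 0x12344u}) // even immediates: or == add
      assert((EvenImm | b) == EvenImm + b);
  }
  return 0;
}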
50138
50139static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50140 TargetLowering::DAGCombinerInfo &DCI,
50141 const X86Subtarget &Subtarget) {
50142 SDValue N0 = N->getOperand(0);
50143 SDValue N1 = N->getOperand(1);
50144 EVT VT = N->getValueType(0);
50145 SDLoc dl(N);
50146 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50147
50148 // If this is SSE1 only convert to FOR to avoid scalarization.
50149 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50150 return DAG.getBitcast(MVT::v4i32,
50151 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50152 DAG.getBitcast(MVT::v4f32, N0),
50153 DAG.getBitcast(MVT::v4f32, N1)));
50154 }
50155
50156 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50157 // TODO: Support multiple SrcOps.
50158 if (VT == MVT::i1) {
50159 SmallVector<SDValue, 2> SrcOps;
50160 SmallVector<APInt, 2> SrcPartials;
50161 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50162 SrcOps.size() == 1) {
50163 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50164 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50165 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50166 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50167 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50168 if (Mask) {
50169 assert(SrcPartials[0].getBitWidth() == NumElts &&
50170 "Unexpected partial reduction mask");
50171 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50172 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50173 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50174 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50175 }
50176 }
50177 }
50178
50179 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50180 return R;
50181
50182 if (SDValue R = combineBitOpWithShift(N, DAG))
50183 return R;
50184
50185 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50186 return FPLogic;
50187
50188 if (DCI.isBeforeLegalizeOps())
50189 return SDValue();
50190
50191 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50192 return R;
50193
50194 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50195 return R;
50196
50197 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50198 return R;
50199
50200 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
50201 if ((VT == MVT::i32 || VT == MVT::i64) &&
50202 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50203 isNullConstant(N0.getOperand(0))) {
50204 SDValue Cond = N0.getOperand(1);
50205 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50206 Cond = Cond.getOperand(0);
50207
50208 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50209 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50210 uint64_t Val = CN->getZExtValue();
50211 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50212 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50213 CCode = X86::GetOppositeBranchCondition(CCode);
50214 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50215
50216 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50217 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50218 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50219 return R;
50220 }
50221 }
50222 }
50223 }
50224
50225 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50226 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50227 // iff the upper elements of the non-shifted arg are zero.
50228 // KUNPCK requires 16+ bool vector elements.
50229 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50230 unsigned NumElts = VT.getVectorNumElements();
50231 unsigned HalfElts = NumElts / 2;
50232 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50233 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50234 N1.getConstantOperandAPInt(1) == HalfElts &&
50235 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50236 return DAG.getNode(
50237 ISD::CONCAT_VECTORS, dl, VT,
50238 extractSubVector(N0, 0, DAG, dl, HalfElts),
50239 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50240 }
50241 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50242 N0.getConstantOperandAPInt(1) == HalfElts &&
50243 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50244 return DAG.getNode(
50245 ISD::CONCAT_VECTORS, dl, VT,
50246 extractSubVector(N1, 0, DAG, dl, HalfElts),
50247 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50248 }
50249 }
50250
50251 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50252 // Attempt to recursively combine an OR of shuffles.
50253 SDValue Op(N, 0);
50254 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50255 return Res;
50256
50257 // If either operand is a constant mask, then only the elements that aren't
50258 // allones are actually demanded by the other operand.
50259 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50260 APInt UndefElts;
50261 SmallVector<APInt> EltBits;
50262 int NumElts = VT.getVectorNumElements();
50263 int EltSizeInBits = VT.getScalarSizeInBits();
50264 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50265 return false;
50266
50267 APInt DemandedElts = APInt::getZero(NumElts);
50268 for (int I = 0; I != NumElts; ++I)
50269 if (!EltBits[I].isAllOnes())
50270 DemandedElts.setBit(I);
50271
50272 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50273 };
50274 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50275 if (N->getOpcode() != ISD::DELETED_NODE)
50276 DCI.AddToWorklist(N);
50277 return SDValue(N, 0);
50278 }
50279 }
50280
50281 // We should fold "masked merge" patterns when `andn` is not available.
50282 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50283 if (SDValue R = foldMaskedMerge(N, DAG))
50284 return R;
50285
50286 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50287 return R;
50288
50289 return SDValue();
50290}
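A standalone check (not part of the listing) of the (0 - SetCC) | C rewrite performed in combineOr above: for a boolean b, (0 - b) | C equals zext(!b) * (C + 1) - 1 for the LEA-friendly constants it accepts.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t b : {0u, 1u})
    for (uint32_t C : {1u, 2u, 3u, 4u, 7u, 8u}) {
      uint32_t Lhs = (0u - b) | C;               // -1 when b, otherwise C
      uint32_t Rhs = (1u - b) * (C + 1u) - 1u;   // zext(!b) * (C + 1) - 1
      assert(Lhs == Rhs);
    }
  return 0;
}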
50291
50292/// Try to turn tests against the signbit in the form of:
50293/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50294/// into:
50295/// SETGT(X, -1)
50296static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50297 // This is only worth doing if the output type is i8 or i1.
50298 EVT ResultType = N->getValueType(0);
50299 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50300 return SDValue();
50301
50302 SDValue N0 = N->getOperand(0);
50303 SDValue N1 = N->getOperand(1);
50304
50305 // We should be performing an xor against a truncated shift.
50306 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50307 return SDValue();
50308
50309 // Make sure we are performing an xor against one.
50310 if (!isOneConstant(N1))
50311 return SDValue();
50312
50313 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50314 SDValue Shift = N0.getOperand(0);
50315 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50316 return SDValue();
50317
50318 // Make sure we are truncating from one of i16, i32 or i64.
50319 EVT ShiftTy = Shift.getValueType();
50320 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50321 return SDValue();
50322
50323 // Make sure the shift amount extracts the sign bit.
50324 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50325 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50326 return SDValue();
50327
50328 // Create a greater-than comparison against -1.
50329 // N.B. Using SETGE against 0 works but we want a canonical-looking
50330 // comparison; using SETGT matches up with what TranslateX86CC does.
50331 SDLoc DL(N);
50332 SDValue ShiftOp = Shift.getOperand(0);
50333 EVT ShiftOpTy = ShiftOp.getValueType();
50334 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50335 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50336 *DAG.getContext(), ResultType);
50337 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50338 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50339 if (SetCCResultType != ResultType)
50340 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50341 return Cond;
50342}
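A standalone scalar check (not part of the listing) of the sign-bit test rewritten above: XOR-ing the logically shifted-down sign bit with 1 gives the same result as the signed comparison x > -1.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {0, 1, -1, 42, -42, INT32_MAX, INT32_MIN}) {
    uint32_t SignBit = static_cast<uint32_t>(x) >> 31; // trunc(srl(x, 31))
    assert((SignBit ^ 1u) == (x > -1 ? 1u : 0u));      // setgt(x, -1)
  }
  return 0;
}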
50343
50344/// Turn vector tests of the signbit in the form of:
50345/// xor (sra X, elt_size(X)-1), -1
50346/// into:
50347/// pcmpgt X, -1
50348///
50349/// This should be called before type legalization because the pattern may not
50350/// persist after that.
50351static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50352 const X86Subtarget &Subtarget) {
50353 EVT VT = N->getValueType(0);
50354 if (!VT.isSimple())
50355 return SDValue();
50356
50357 switch (VT.getSimpleVT().SimpleTy) {
50358 default: return SDValue();
50359 case MVT::v16i8:
50360 case MVT::v8i16:
50361 case MVT::v4i32:
50362 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50363 case MVT::v32i8:
50364 case MVT::v16i16:
50365 case MVT::v8i32:
50366 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50367 }
50368
50369 // There must be a shift right algebraic before the xor, and the xor must be a
50370 // 'not' operation.
50371 SDValue Shift = N->getOperand(0);
50372 SDValue Ones = N->getOperand(1);
50373 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50374 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50375 return SDValue();
50376
50377 // The shift should be smearing the sign bit across each vector element.
50378 auto *ShiftAmt =
50379 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50380 if (!ShiftAmt ||
50381 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50382 return SDValue();
50383
50384 // Create a greater-than comparison against -1. We don't use the more obvious
50385 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50386 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50387}
50388
50389/// Detect patterns of truncation with unsigned saturation:
50390///
50391/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50392/// Return the source value x to be truncated or SDValue() if the pattern was
50393/// not matched.
50394///
50395/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50396/// where C1 >= 0 and C2 is unsigned max of destination type.
50397///
50398/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50399/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50400///
50401/// These two patterns are equivalent to:
50402/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50403/// So return the smax(x, C1) value to be truncated or SDValue() if the
50404/// pattern was not matched.
50405static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50406 const SDLoc &DL) {
50407 EVT InVT = In.getValueType();
50408
50409 // Saturation with truncation. We truncate from InVT to VT.
50410 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
50411 "Unexpected types for truncate operation");
50412
50413 // Match min/max and return limit value as a parameter.
50414 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50415 if (V.getOpcode() == Opcode &&
50416 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50417 return V.getOperand(0);
50418 return SDValue();
50419 };
50420
50421 APInt C1, C2;
50422 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
50423 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
50424 // to the element size of the destination type.
50425 if (C2.isMask(VT.getScalarSizeInBits()))
50426 return UMin;
50427
50428 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
50429 if (MatchMinMax(SMin, ISD::SMAX, C1))
50430 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
50431 return SMin;
50432
50433 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
50434 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
50435 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
50436 C2.uge(C1)) {
50437 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
50438 }
50439
50440 return SDValue();
50441}
50442
50443/// Detect patterns of truncation with signed saturation:
50444/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
50445/// signed_max_of_dest_type)) to dest_type)
50446/// or:
50447/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
50448/// signed_min_of_dest_type)) to dest_type).
50449/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
50450/// Return the source value to be truncated or SDValue() if the pattern was not
50451/// matched.
50452static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
50453 unsigned NumDstBits = VT.getScalarSizeInBits();
50454 unsigned NumSrcBits = In.getScalarValueSizeInBits();
50455 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
50456
50457 auto MatchMinMax = [](SDValue V, unsigned Opcode,
50458 const APInt &Limit) -> SDValue {
50459 APInt C;
50460 if (V.getOpcode() == Opcode &&
50461 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
50462 return V.getOperand(0);
50463 return SDValue();
50464 };
50465
50466 APInt SignedMax, SignedMin;
50467 if (MatchPackUS) {
50468 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
50469 SignedMin = APInt(NumSrcBits, 0);
50470 } else {
50471 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
50472 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
50473 }
50474
50475 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
50476 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
50477 return SMax;
50478
50479 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
50480 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
50481 return SMin;
50482
50483 return SDValue();
50484}
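Standalone scalar counterparts (not part of the listing) of the patterns these two helpers detect: clamping in the wider type and then truncating is the unsigned/signed saturating truncation, shown here for i32 -> u8/i8.

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint8_t truncUSat(int32_t x) {  // smax/smin clamp to [0, 255], then trunc
  return static_cast<uint8_t>(std::min(std::max(x, 0), 255));
}
static int8_t truncSSat(int32_t x) {   // smax/smin clamp to [-128, 127], then trunc
  return static_cast<int8_t>(std::min(std::max(x, -128), 127));
}

int main() {
  assert(truncUSat(-5) == 0 && truncUSat(300) == 255 && truncUSat(77) == 77);
  assert(truncSSat(-300) == -128 && truncSSat(200) == 127 && truncSSat(-7) == -7);
  return 0;
}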
50485
50486static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
50487 SelectionDAG &DAG,
50488 const X86Subtarget &Subtarget) {
50489 if (!Subtarget.hasSSE2() || !VT.isVector())
50490 return SDValue();
50491
50492 EVT SVT = VT.getVectorElementType();
50493 EVT InVT = In.getValueType();
50494 EVT InSVT = InVT.getVectorElementType();
50495
50496 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
50497 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
50498 // and concatenate at the same time. Then we can use a final vpmovuswb to
50499 // clip to 0-255.
50500 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
50501 InVT == MVT::v16i32 && VT == MVT::v16i8) {
50502 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50503 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
50504 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
50505 DL, DAG, Subtarget);
50506 assert(Mid && "Failed to pack!");
50507 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
50508 }
50509 }
50510
50511 // vXi32 truncate instructions are available with AVX512F.
50512 // vXi16 truncate instructions are only available with AVX512BW.
50513 // For 256-bit or smaller vectors, we require VLX.
50514 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
50515 // If the result type is 256 bits or larger and we have disabled 512-bit
50516 // registers, we should go ahead and use the pack instructions if possible.
50517 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
50518 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
50519 (InVT.getSizeInBits() > 128) &&
50520 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
50521 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
50522
50523 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
50524 VT.getSizeInBits() >= 64 &&
50525 (SVT == MVT::i8 || SVT == MVT::i16) &&
50526 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
50527 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50528 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
50529 // Only do this when the result is at least 64 bits or we'd be leaving
50530 // dangling PACKSSDW nodes.
50531 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50532 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50533 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50534 DAG, Subtarget);
50535 assert(Mid && "Failed to pack!");
50536 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
50537 Subtarget);
50538 assert(V && "Failed to pack!");
50539 return V;
50540 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50541 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50542 Subtarget);
50543 }
50544 if (SDValue SSatVal = detectSSatPattern(In, VT))
50545 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50546 Subtarget);
50547 }
50548
50549 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50550 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50551 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50552 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50553 unsigned TruncOpc = 0;
50554 SDValue SatVal;
50555 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50556 SatVal = SSatVal;
50557 TruncOpc = X86ISD::VTRUNCS;
50558 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50559 SatVal = USatVal;
50560 TruncOpc = X86ISD::VTRUNCUS;
50561 }
50562 if (SatVal) {
50563 unsigned ResElts = VT.getVectorNumElements();
50564 // If the input type is less than 512 bits and we don't have VLX, we need
50565 // to widen to 512 bits.
50566 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50567 unsigned NumConcats = 512 / InVT.getSizeInBits();
50568 ResElts *= NumConcats;
50569 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50570 ConcatOps[0] = SatVal;
50571 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50572 NumConcats * InVT.getVectorNumElements());
50573 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50574 }
50575 // Widen the result if it's narrower than 128 bits.
50576 if (ResElts * SVT.getSizeInBits() < 128)
50577 ResElts = 128 / SVT.getSizeInBits();
50578 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50579 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50580 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50581 DAG.getIntPtrConstant(0, DL));
50582 }
50583 }
50584
50585 return SDValue();
50586}
50587
50588/// This function detects the AVG pattern between vectors of unsigned i8/i16,
50589 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
50590/// ISD::AVGCEILU (AVG) instruction.
50591static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50592 const X86Subtarget &Subtarget,
50593 const SDLoc &DL) {
50594 if (!VT.isVector())
50595 return SDValue();
50596 EVT InVT = In.getValueType();
50597 unsigned NumElems = VT.getVectorNumElements();
50598
50599 EVT ScalarVT = VT.getVectorElementType();
50600 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
50601 return SDValue();
50602
50603 // InScalarVT is the intermediate type in the AVG pattern and it should be greater
50604 // than the original input type (i8/i16).
50605 EVT InScalarVT = InVT.getVectorElementType();
50606 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
50607 return SDValue();
50608
50609 if (!Subtarget.hasSSE2())
50610 return SDValue();
50611
50612 // Detect the following pattern:
50613 //
50614 // %1 = zext <N x i8> %a to <N x i32>
50615 // %2 = zext <N x i8> %b to <N x i32>
50616 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
50617 // %4 = add nuw nsw <N x i32> %3, %2
50618 // %5 = lshr <N x i32> %4, <i32 1 x N>
50619 // %6 = trunc <N x i32> %5 to <N x i8>
50620 //
50621 // In AVX512, the last instruction can also be a trunc store.
50622 if (In.getOpcode() != ISD::SRL)
50623 return SDValue();
50624
50625 // A lambda checking the given SDValue is a constant vector and each element
50626 // is in the range [Min, Max].
50627 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
50628 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
50629 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
50630 });
50631 };
50632
50633 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
50634 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
50635 return MaxActiveBits <= ScalarVT.getSizeInBits();
50636 };
50637
50638 // Check if each element of the vector is right-shifted by one.
50639 SDValue LHS = In.getOperand(0);
50640 SDValue RHS = In.getOperand(1);
50641 if (!IsConstVectorInRange(RHS, 1, 1))
50642 return SDValue();
50643 if (LHS.getOpcode() != ISD::ADD)
50644 return SDValue();
50645
50646 // Detect a pattern of a + b + 1 where the order doesn't matter.
50647 SDValue Operands[3];
50648 Operands[0] = LHS.getOperand(0);
50649 Operands[1] = LHS.getOperand(1);
50650
50651 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50652 ArrayRef<SDValue> Ops) {
50653 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
50654 };
50655
50656 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
50657 for (SDValue &Op : Ops)
50658 if (Op.getValueType() != VT)
50659 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
50660 // Pad to a power-of-2 vector, split+apply and extract the original vector.
50661 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
50662 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
50663 if (NumElemsPow2 != NumElems) {
50664 for (SDValue &Op : Ops) {
50665 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
50666 for (unsigned i = 0; i != NumElems; ++i) {
50667 SDValue Idx = DAG.getIntPtrConstant(i, DL);
50668 EltsOfOp[i] =
50669 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
50670 }
50671 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
50672 }
50673 }
50674 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
50675 if (NumElemsPow2 == NumElems)
50676 return Res;
50677 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50678 DAG.getIntPtrConstant(0, DL));
50679 };
50680
50681 // Take care of the case when one of the operands is a constant vector whose
50682 // element is in the range [1, 256].
50683 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
50684 IsZExtLike(Operands[0])) {
50685 // The pattern is detected. Subtract one from the constant vector, then
50686 // demote it and emit X86ISD::AVG instruction.
50687 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
50688 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
50689 return AVGSplitter({Operands[0], Operands[1]});
50690 }
50691
50692 // Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
50693 // Match the or case only if it's 'add-like', i.e. can be replaced by an add.
50694 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
50695 if (ISD::ADD == V.getOpcode()) {
50696 Op0 = V.getOperand(0);
50697 Op1 = V.getOperand(1);
50698 return true;
50699 }
50700 if (ISD::ZERO_EXTEND != V.getOpcode())
50701 return false;
50702 V = V.getOperand(0);
50703 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
50704 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
50705 return false;
50706 Op0 = V.getOperand(0);
50707 Op1 = V.getOperand(1);
50708 return true;
50709 };
50710
50711 SDValue Op0, Op1;
50712 if (FindAddLike(Operands[0], Op0, Op1))
50713 std::swap(Operands[0], Operands[1]);
50714 else if (!FindAddLike(Operands[1], Op0, Op1))
50715 return SDValue();
50716 Operands[2] = Op0;
50717 Operands[1] = Op1;
50718
50719 // Now we have three operands of two additions. Check that one of them is a
50720 // constant vector with ones, and the other two can be promoted from i8/i16.
50721 for (SDValue &Op : Operands) {
50722 if (!IsConstVectorInRange(Op, 1, 1))
50723 continue;
50724 std::swap(Op, Operands[2]);
50725
50726 // Check if Operands[0] and Operands[1] are results of type promotion.
50727 for (int j = 0; j < 2; ++j)
50728 if (Operands[j].getValueType() != VT)
50729 if (!IsZExtLike(Operands[j]))
50730 return SDValue();
50731
50732 // The pattern is detected, emit X86ISD::AVG instruction(s).
50733 return AVGSplitter({Operands[0], Operands[1]});
50734 }
50735
50736 return SDValue();
50737}
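A standalone exhaustive check (not part of the listing) of the rounding-average identity the AVG pattern encodes: (a + b + 1) >> 1 computed in a wider type matches an 8-bit-only computation, which is the PAVGB-style semantics ISD::AVGCEILU provides.

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t Widened = static_cast<uint8_t>((a + b + 1) >> 1);          // 9-bit temporary
      uint8_t NoWiden = static_cast<uint8_t>((a | b) - ((a ^ b) >> 1));  // same value, 8 bits only
      assert(Widened == NoWiden);
    }
  return 0;
}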
50738
50739static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
50740 TargetLowering::DAGCombinerInfo &DCI,
50741 const X86Subtarget &Subtarget) {
50742 LoadSDNode *Ld = cast<LoadSDNode>(N);
50743 EVT RegVT = Ld->getValueType(0);
50744 EVT MemVT = Ld->getMemoryVT();
50745 SDLoc dl(Ld);
50746 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50747
50748 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50749 // into two 16-byte operations. Also split non-temporal aligned loads on
50750 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
50751 ISD::LoadExtType Ext = Ld->getExtensionType();
50752 unsigned Fast;
50753 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
50754 Ext == ISD::NON_EXTLOAD &&
50755 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
50756 Ld->getAlign() >= Align(16)) ||
50757 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
50758 *Ld->getMemOperand(), &Fast) &&
50759 !Fast))) {
50760 unsigned NumElems = RegVT.getVectorNumElements();
50761 if (NumElems < 2)
50762 return SDValue();
50763
50764 unsigned HalfOffset = 16;
50765 SDValue Ptr1 = Ld->getBasePtr();
50766 SDValue Ptr2 =
50767 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
50768 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
50769 NumElems / 2);
50770 SDValue Load1 =
50771 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
50772 Ld->getOriginalAlign(),
50773 Ld->getMemOperand()->getFlags());
50774 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
50775 Ld->getPointerInfo().getWithOffset(HalfOffset),
50776 Ld->getOriginalAlign(),
50777 Ld->getMemOperand()->getFlags());
50778 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
50779 Load1.getValue(1), Load2.getValue(1));
50780
50781 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
50782 return DCI.CombineTo(N, NewVec, TF, true);
50783 }
50784
50785 // Bool vector load - attempt to cast to an integer, as we have good
50786 // (vXiY *ext(vXi1 bitcast(iX))) handling.
50787 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
50788 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
50789 unsigned NumElts = RegVT.getVectorNumElements();
50790 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50791 if (TLI.isTypeLegal(IntVT)) {
50792 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
50793 Ld->getPointerInfo(),
50794 Ld->getOriginalAlign(),
50795 Ld->getMemOperand()->getFlags());
50796 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
50797 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
50798 }
50799 }
50800
50801 // If we also broadcast this as a subvector to a wider type, then just extract
50802 // the lowest subvector.
50803 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
50804 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
50805 SDValue Ptr = Ld->getBasePtr();
50806 SDValue Chain = Ld->getChain();
50807 for (SDNode *User : Ptr->uses()) {
50808 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50809 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
50810 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
50811 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
50812 MemVT.getSizeInBits() &&
50813 !User->hasAnyUseOfValue(1) &&
50814 User->getValueSizeInBits(0).getFixedValue() >
50815 RegVT.getFixedSizeInBits()) {
50816 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50817 RegVT.getSizeInBits());
50818 Extract = DAG.getBitcast(RegVT, Extract);
50819 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50820 }
50821 }
50822 }
50823
50824 // Cast ptr32 and ptr64 pointers to the default address space before a load.
50825 unsigned AddrSpace = Ld->getAddressSpace();
50826 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50827 AddrSpace == X86AS::PTR32_UPTR) {
50828 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50829 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
50830 SDValue Cast =
50831 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
50832 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
50833 Ld->getOriginalAlign(),
50834 Ld->getMemOperand()->getFlags());
50835 }
50836 }
50837
50838 return SDValue();
50839}
50840
50841/// If V is a build vector of boolean constants and exactly one of those
50842/// constants is true, return the operand index of that true element.
50843/// Otherwise, return -1.
50844static int getOneTrueElt(SDValue V) {
50845 // This needs to be a build vector of booleans.
50846 // TODO: Checking for the i1 type matches the IR definition for the mask,
50847 // but the mask check could be loosened to i8 or other types. That might
50848 // also require checking more than 'allOnesValue'; eg, the x86 HW
50849 // instructions only require that the MSB is set for each mask element.
50850 // The ISD::MSTORE comments/definition do not specify how the mask operand
50851 // is formatted.
50852 auto *BV = dyn_cast<BuildVectorSDNode>(V);
50853 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
50854 return -1;
50855
50856 int TrueIndex = -1;
50857 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
50858 for (unsigned i = 0; i < NumElts; ++i) {
50859 const SDValue &Op = BV->getOperand(i);
50860 if (Op.isUndef())
50861 continue;
50862 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
50863 if (!ConstNode)
50864 return -1;
50865 if (ConstNode->getAPIntValue().countr_one() >= 1) {
50866 // If we already found a one, this is too many.
50867 if (TrueIndex >= 0)
50868 return -1;
50869 TrueIndex = i;
50870 }
50871 }
50872 return TrueIndex;
50873}
50874
50875/// Given a masked memory load/store operation, return true if it has one mask
50876/// bit set. If it has one mask bit set, then also return the memory address of
50877/// the scalar element to load/store, the vector index to insert/extract that
50878/// scalar element, and the alignment for the scalar memory access.
50879static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
50880 SelectionDAG &DAG, SDValue &Addr,
50881 SDValue &Index, Align &Alignment,
50882 unsigned &Offset) {
50883 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
50884 if (TrueMaskElt < 0)
50885 return false;
50886
50887 // Get the address of the one scalar element that is specified by the mask
50888 // using the appropriate offset from the base pointer.
50889 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
50890 Offset = 0;
50891 Addr = MaskedOp->getBasePtr();
50892 if (TrueMaskElt != 0) {
50893 Offset = TrueMaskElt * EltVT.getStoreSize();
50894 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
50895 SDLoc(MaskedOp));
50896 }
50897
50898 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
50899 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
50900 EltVT.getStoreSize());
50901 return true;
50902}
50903
50904/// If exactly one element of the mask is set for a non-extending masked load,
50905/// it is a scalar load and vector insert.
50906/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50907/// mask have already been optimized in IR, so we don't bother with those here.
50908static SDValue
50909reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50910 TargetLowering::DAGCombinerInfo &DCI,
50911 const X86Subtarget &Subtarget) {
50912 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50913 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50914 // However, some target hooks may need to be added to know when the transform
50915 // is profitable. Endianness would also have to be considered.
50916
50917 SDValue Addr, VecIndex;
50918 Align Alignment;
50919 unsigned Offset;
50920 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
50921 return SDValue();
50922
50923 // Load the one scalar element that is specified by the mask using the
50924 // appropriate offset from the base pointer.
50925 SDLoc DL(ML);
50926 EVT VT = ML->getValueType(0);
50927 EVT EltVT = VT.getVectorElementType();
50928
50929 EVT CastVT = VT;
50930 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50931 EltVT = MVT::f64;
50932 CastVT = VT.changeVectorElementType(EltVT);
50933 }
50934
50935 SDValue Load =
50936 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
50937 ML->getPointerInfo().getWithOffset(Offset),
50938 Alignment, ML->getMemOperand()->getFlags());
50939
50940 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
50941
50942 // Insert the loaded element into the appropriate place in the vector.
50943 SDValue Insert =
50944 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
50945 Insert = DAG.getBitcast(VT, Insert);
50946 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
50947}
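A plain C++ model (not SelectionDAG code) of the reduction above: a masked load whose mask has exactly one true element is just a scalar load from base + index * sizeof(element), inserted into the pass-through vector.

#include <array>
#include <cassert>
#include <cstdint>

using Vec4 = std::array<int32_t, 4>;

static Vec4 maskedLoad(const int32_t *Mem, std::array<bool, 4> Mask, Vec4 PassThru) {
  Vec4 R = PassThru;
  for (int i = 0; i < 4; ++i)
    if (Mask[i])
      R[i] = Mem[i];
  return R;
}

int main() {
  int32_t Mem[4] = {10, 20, 30, 40};
  Vec4 PassThru = {-1, -2, -3, -4};
  std::array<bool, 4> OneBit = {false, false, true, false}; // single true element

  Vec4 Full = maskedLoad(Mem, OneBit, PassThru);

  // Equivalent scalar load + insert at the true element's index.
  Vec4 Reduced = PassThru;
  Reduced[2] = *(Mem + 2); // load from BasePtr + 2 * sizeof(element)
  assert(Full == Reduced);
  return 0;
}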
50948
50949static SDValue
50950combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50951 TargetLowering::DAGCombinerInfo &DCI) {
50952 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50953 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
50954 return SDValue();
50955
50956 SDLoc DL(ML);
50957 EVT VT = ML->getValueType(0);
50958
50959 // If we are loading the first and last elements of a vector, it is safe and
50960 // always faster to load the whole vector. Replace the masked load with a
50961 // vector load and select.
50962 unsigned NumElts = VT.getVectorNumElements();
50963 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
50964 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
50965 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
50966 if (LoadFirstElt && LoadLastElt) {
50967 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
50968 ML->getMemOperand());
50969 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
50970 ML->getPassThru());
50971 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
50972 }
50973
50974 // Convert a masked load with a constant mask into a masked load and a select.
50975 // This allows the select operation to use a faster kind of select instruction
50976 // (for example, vblendvps -> vblendps).
50977
50978 // Don't try this if the pass-through operand is already undefined. That would
50979 // cause an infinite loop because that's what we're about to create.
50980 if (ML->getPassThru().isUndef())
50981 return SDValue();
50982
50983 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
50984 return SDValue();
50985
50986 // The new masked load has an undef pass-through operand. The select uses the
50987 // original pass-through operand.
50988 SDValue NewML = DAG.getMaskedLoad(
50989 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
50990 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
50991 ML->getAddressingMode(), ML->getExtensionType());
50992 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
50993 ML->getPassThru());
50994
50995 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
50996}
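// Illustrative sketch of the two cases above: with a constant mask whose first
// and last lanes are set, e.g.
//   t1: v8f32,ch = masked_load Ptr, Mask=<1,0,...,0,1>, PassThru
// the whole vector is loaded and blended:
//   t2: v8f32,ch = load Ptr
//   t1: v8f32    = vselect Mask, t2, PassThru
// Otherwise, for any other constant mask with a meaningful pass-through, the
// pass-through is moved into a separate vselect so the masked load itself can
// use an undef pass-through operand.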
50997
50998static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
50999 TargetLowering::DAGCombinerInfo &DCI,
51000 const X86Subtarget &Subtarget) {
51001 auto *Mld = cast<MaskedLoadSDNode>(N);
51002
51003 // TODO: Expanding load with constant mask may be optimized as well.
51004 if (Mld->isExpandingLoad())
51005 return SDValue();
51006
51007 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51008 if (SDValue ScalarLoad =
51009 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51010 return ScalarLoad;
51011
51012 // TODO: Do some AVX512 subsets benefit from this transform?
51013 if (!Subtarget.hasAVX512())
51014 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51015 return Blend;
51016 }
51017
51018 // If the mask value has been legalized to a non-boolean vector, try to
51019 // simplify ops leading up to it. We only demand the MSB of each lane.
51020 SDValue Mask = Mld->getMask();
51021 if (Mask.getScalarValueSizeInBits() != 1) {
51022 EVT VT = Mld->getValueType(0);
51023 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51024 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51025 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51026 if (N->getOpcode() != ISD::DELETED_NODE)
51027 DCI.AddToWorklist(N);
51028 return SDValue(N, 0);
51029 }
51030 if (SDValue NewMask =
51031 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51032 return DAG.getMaskedLoad(
51033 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51034 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51035 Mld->getAddressingMode(), Mld->getExtensionType());
51036 }
51037
51038 return SDValue();
51039}
51040
51041/// If exactly one element of the mask is set for a non-truncating masked store,
51042/// reduce it to a vector extract and a scalar store.
51043/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51044/// mask have already been optimized in IR, so we don't bother with those here.
51045static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51046 SelectionDAG &DAG,
51047 const X86Subtarget &Subtarget) {
51048 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51049 // However, some target hooks may need to be added to know when the transform
51050 // is profitable. Endianness would also have to be considered.
51051
51052 SDValue Addr, VecIndex;
51053 Align Alignment;
51054 unsigned Offset;
51055 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51056 return SDValue();
51057
51058 // Extract the one scalar element that is actually being stored.
51059 SDLoc DL(MS);
51060 SDValue Value = MS->getValue();
51061 EVT VT = Value.getValueType();
51062 EVT EltVT = VT.getVectorElementType();
51063 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51064 EltVT = MVT::f64;
51065 EVT CastVT = VT.changeVectorElementType(EltVT);
51066 Value = DAG.getBitcast(CastVT, Value);
51067 }
51068 SDValue Extract =
51069 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51070
51071 // Store that element at the appropriate offset from the base pointer.
51072 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51073 MS->getPointerInfo().getWithOffset(Offset),
51074 Alignment, MS->getMemOperand()->getFlags());
51075}
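// Illustrative sketch: with a one-hot constant mask, the masked store above
// becomes roughly
//   t1: f32 = extract_vector_elt Value, 2
//   ch      = store t1, (BasePtr + 2 * sizeof(f32))
// mirroring the masked-load case handled earlier in this file.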
51076
51077static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51078 TargetLowering::DAGCombinerInfo &DCI,
51079 const X86Subtarget &Subtarget) {
51080 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51081 if (Mst->isCompressingStore())
51082 return SDValue();
51083
51084 EVT VT = Mst->getValue().getValueType();
51085 SDLoc dl(Mst);
51086 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51087
51088 if (Mst->isTruncatingStore())
51089 return SDValue();
51090
51091 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51092 return ScalarStore;
51093
51094 // If the mask value has been legalized to a non-boolean vector, try to
51095 // simplify ops leading up to it. We only demand the MSB of each lane.
51096 SDValue Mask = Mst->getMask();
51097 if (Mask.getScalarValueSizeInBits() != 1) {
51098 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51099 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51100 if (N->getOpcode() != ISD::DELETED_NODE)
51101 DCI.AddToWorklist(N);
51102 return SDValue(N, 0);
51103 }
51104 if (SDValue NewMask =
51105 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51106 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51107 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51108 Mst->getMemoryVT(), Mst->getMemOperand(),
51109 Mst->getAddressingMode());
51110 }
51111
51112 SDValue Value = Mst->getValue();
51113 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51114 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51115 Mst->getMemoryVT())) {
51116 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51117 Mst->getBasePtr(), Mst->getOffset(), Mask,
51118 Mst->getMemoryVT(), Mst->getMemOperand(),
51119 Mst->getAddressingMode(), true);
51120 }
51121
51122 return SDValue();
51123}
51124
51125static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51126 TargetLowering::DAGCombinerInfo &DCI,
51127 const X86Subtarget &Subtarget) {
51128 StoreSDNode *St = cast<StoreSDNode>(N);
51129 EVT StVT = St->getMemoryVT();
51130 SDLoc dl(St);
51131 SDValue StoredVal = St->getValue();
51132 EVT VT = StoredVal.getValueType();
51133 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51134
51135 // Convert a store of vXi1 into a store of iX and a bitcast.
51136 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51137 VT.getVectorElementType() == MVT::i1) {
51138
51139 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51140 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51141
51142 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51143 St->getPointerInfo(), St->getOriginalAlign(),
51144 St->getMemOperand()->getFlags());
51145 }
51146
51147 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51148 // This will avoid a copy to k-register.
51149 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51150 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51151 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51152 SDValue Val = StoredVal.getOperand(0);
51153 // We must store zeros to the unused bits.
51154 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51155 return DAG.getStore(St->getChain(), dl, Val,
51156 St->getBasePtr(), St->getPointerInfo(),
51157 St->getOriginalAlign(),
51158 St->getMemOperand()->getFlags());
51159 }
51160
51161 // Widen v2i1/v4i1 stores to v8i1.
51162 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51163 Subtarget.hasAVX512()) {
51164 unsigned NumConcats = 8 / VT.getVectorNumElements();
51165 // We must store zeros to the unused bits.
51166 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51167 Ops[0] = StoredVal;
51168 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51169 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51170 St->getPointerInfo(), St->getOriginalAlign(),
51171 St->getMemOperand()->getFlags());
51172 }
51173
51174 // Turn vXi1 stores of constants into a scalar store.
51175 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51176 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51177 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51179 // If it's a v64i1 store without 64-bit support, we need two stores.
51179 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51180 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51181 StoredVal->ops().slice(0, 32));
51182 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51183 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51184 StoredVal->ops().slice(32, 32));
51185 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51186
51187 SDValue Ptr0 = St->getBasePtr();
51188 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
51189
51190 SDValue Ch0 =
51191 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51192 St->getOriginalAlign(),
51193 St->getMemOperand()->getFlags());
51194 SDValue Ch1 =
51195 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51196 St->getPointerInfo().getWithOffset(4),
51197 St->getOriginalAlign(),
51198 St->getMemOperand()->getFlags());
51199 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51200 }
51201
51202 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51203 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51204 St->getPointerInfo(), St->getOriginalAlign(),
51205 St->getMemOperand()->getFlags());
51206 }
51207
51208 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51209 // Sandy Bridge, perform two 16-byte stores.
51210 unsigned Fast;
51211 if (VT.is256BitVector() && StVT == VT &&
51212 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51213 *St->getMemOperand(), &Fast) &&
51214 !Fast) {
51215 unsigned NumElems = VT.getVectorNumElements();
51216 if (NumElems < 2)
51217 return SDValue();
51218
51219 return splitVectorStore(St, DAG);
51220 }
51221
51222 // Split under-aligned vector non-temporal stores.
51223 if (St->isNonTemporal() && StVT == VT &&
51224 St->getAlign().value() < VT.getStoreSize()) {
51225 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51226 // vectors or the legalizer can scalarize it to use MOVNTI.
51227 if (VT.is256BitVector() || VT.is512BitVector()) {
51228 unsigned NumElems = VT.getVectorNumElements();
51229 if (NumElems < 2)
51230 return SDValue();
51231 return splitVectorStore(St, DAG);
51232 }
51233
51234 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51235 // to use MOVNTI.
51236 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51237 MVT NTVT = Subtarget.hasSSE4A()
51238 ? MVT::v2f64
51239 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51240 return scalarizeVectorStore(St, NTVT, DAG);
51241 }
51242 }
51243
51244 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51245 // supported, but AVX512F is, by extending to v16i32 and truncating.
51246 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51247 St->getValue().getOpcode() == ISD::TRUNCATE &&
51248 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51249 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51250 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51251 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51252 St->getValue().getOperand(0));
51253 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51254 MVT::v16i8, St->getMemOperand());
51255 }
51256
51257 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51258 if (!St->isTruncatingStore() &&
51259 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51260 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51261 StoredVal.hasOneUse() &&
51262 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51263 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51264 return EmitTruncSStore(IsSigned, St->getChain(),
51265 dl, StoredVal.getOperand(0), St->getBasePtr(),
51266 VT, St->getMemOperand(), DAG);
51267 }
51268
51269 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51270 if (!St->isTruncatingStore()) {
51271 auto IsExtractedElement = [](SDValue V) {
51272 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51273 V = V.getOperand(0);
51274 unsigned Opc = V.getOpcode();
51275 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51276 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51277 V.getOperand(0).hasOneUse())
51278 return V.getOperand(0);
51279 return SDValue();
51280 };
51281 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51282 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51283 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51284 SDValue Src = Trunc.getOperand(0);
51285 MVT DstVT = Trunc.getSimpleValueType();
51286 MVT SrcVT = Src.getSimpleValueType();
51287 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51288 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51289 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51290 if (NumTruncBits == VT.getSizeInBits() &&
51291 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51292 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51293 TruncVT, St->getMemOperand());
51294 }
51295 }
51296 }
51297 }
51298
51299 // Optimize trunc store (of multiple scalars) to shuffle and store.
51300 // First, pack all of the elements in one place. Next, store to memory
51301 // in fewer chunks.
51302 if (St->isTruncatingStore() && VT.isVector()) {
51303 // Check if we can detect an AVG pattern from the truncation. If yes,
51304 // replace the trunc store with a normal store of the result of an
51305 // X86ISD::AVG instruction.
51306 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51307 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51308 Subtarget, dl))
51309 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51310 St->getPointerInfo(), St->getOriginalAlign(),
51311 St->getMemOperand()->getFlags());
51312
51313 if (TLI.isTruncStoreLegal(VT, StVT)) {
51314 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51315 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51316 dl, Val, St->getBasePtr(),
51317 St->getMemoryVT(), St->getMemOperand(), DAG);
51318 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51319 DAG, dl))
51320 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51321 dl, Val, St->getBasePtr(),
51322 St->getMemoryVT(), St->getMemOperand(), DAG);
51323 }
51324
51325 return SDValue();
51326 }
51327
51328 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51329 unsigned AddrSpace = St->getAddressSpace();
51330 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51331 AddrSpace == X86AS::PTR32_UPTR) {
51332 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51333 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51334 SDValue Cast =
51335 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51336 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
51337 St->getPointerInfo(), St->getOriginalAlign(),
51338 St->getMemOperand()->getFlags(), St->getAAInfo());
51339 }
51340 }
51341
51342 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51343 // the FP state in cases where an emms may be missing.
51344 // A preferable solution to the general problem is to figure out the right
51345 // places to insert EMMS. This qualifies as a quick hack.
51346
51347 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51348 if (VT.getSizeInBits() != 64)
51349 return SDValue();
51350
51351 const Function &F = DAG.getMachineFunction().getFunction();
51352 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51353 bool F64IsLegal =
51354 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51355 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
51356 isa<LoadSDNode>(St->getValue()) &&
51357 cast<LoadSDNode>(St->getValue())->isSimple() &&
51358 St->getChain().hasOneUse() && St->isSimple()) {
51359 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
51360
51361 if (!ISD::isNormalLoad(Ld))
51362 return SDValue();
51363
51364 // Avoid the transformation if there are multiple uses of the loaded value.
51365 if (!Ld->hasNUsesOfValue(1, 0))
51366 return SDValue();
51367
51368 SDLoc LdDL(Ld);
51369 SDLoc StDL(N);
51370 // Lower to a single movq load/store pair.
51371 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51372 Ld->getBasePtr(), Ld->getMemOperand());
51373
51374 // Make sure new load is placed in same chain order.
51375 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51376 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51377 St->getMemOperand());
51378 }
51379
51380 // This is similar to the above case, but here we handle a scalar 64-bit
51381 // integer store that is extracted from a vector on a 32-bit target.
51382 // If we have SSE2, then we can treat it like a floating-point double
51383 // to get past legalization. The execution dependencies fixup pass will
51384 // choose the optimal machine instruction for the store if this really is
51385 // an integer or v2f32 rather than an f64.
51386 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
51387 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51388 SDValue OldExtract = St->getOperand(1);
51389 SDValue ExtOp0 = OldExtract.getOperand(0);
51390 unsigned VecSize = ExtOp0.getValueSizeInBits();
51391 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51392 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51393 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51394 BitCast, OldExtract.getOperand(1));
51395 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51396 St->getPointerInfo(), St->getOriginalAlign(),
51397 St->getMemOperand()->getFlags());
51398 }
51399
51400 return SDValue();
51401}
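// Illustrative sketch: on a 32-bit target with SSE2, a hypothetical 64-bit
// copy such as
//   void copy64(const long long *S, long long *D) { *D = *S; }  // hypothetical
// would otherwise legalize its i64 load/store into two i32 pairs; the
// load->store path above instead emits a single f64 (movq) load/store pair,
// and the extract_vector_elt path does the same for a 64-bit element pulled
// out of a vector.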
51402
51403static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51404 TargetLowering::DAGCombinerInfo &DCI,
51405 const X86Subtarget &Subtarget) {
51406 auto *St = cast<MemIntrinsicSDNode>(N);
51407
51408 SDValue StoredVal = N->getOperand(1);
51409 MVT VT = StoredVal.getSimpleValueType();
51410 EVT MemVT = St->getMemoryVT();
51411
51412 // Figure out which elements we demand.
51413 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51414 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51415
51416 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51417 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51418 if (N->getOpcode() != ISD::DELETED_NODE)
51419 DCI.AddToWorklist(N);
51420 return SDValue(N, 0);
51421 }
51422
51423 return SDValue();
51424}
51425
51426/// Return 'true' if this vector operation is "horizontal"
51427/// and return the operands for the horizontal operation in LHS and RHS. A
51428/// horizontal operation performs the binary operation on successive elements
51429/// of its first operand, then on successive elements of its second operand,
51430/// returning the resulting values in a vector. For example, if
51431/// A = < float a0, float a1, float a2, float a3 >
51432/// and
51433/// B = < float b0, float b1, float b2, float b3 >
51434/// then the result of doing a horizontal operation on A and B is
51435/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51436/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
51437/// A horizontal-op B, for some already available A and B, and if so then LHS is
51438/// set to A, RHS to B, and the routine returns 'true'.
51439static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
51440 SelectionDAG &DAG, const X86Subtarget &Subtarget,
51441 bool IsCommutative,
51442 SmallVectorImpl<int> &PostShuffleMask) {
51443 // If either operand is undef, bail out. The binop should be simplified.
51444 if (LHS.isUndef() || RHS.isUndef())
51445 return false;
51446
51447 // Look for the following pattern:
51448 // A = < float a0, float a1, float a2, float a3 >
51449 // B = < float b0, float b1, float b2, float b3 >
51450 // and
51451 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
51452 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
51453 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
51454 // which is A horizontal-op B.
51455
51456 MVT VT = LHS.getSimpleValueType();
51457 assert((VT.is128BitVector() || VT.is256BitVector()) &&
51458 "Unsupported vector type for horizontal add/sub");
51459 unsigned NumElts = VT.getVectorNumElements();
51460
51461 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
51462 SmallVectorImpl<int> &ShuffleMask) {
51463 bool UseSubVector = false;
51464 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51465 Op.getOperand(0).getValueType().is256BitVector() &&
51466 llvm::isNullConstant(Op.getOperand(1))) {
51467 Op = Op.getOperand(0);
51468 UseSubVector = true;
51469 }
51470 SmallVector<SDValue, 2> SrcOps;
51471 SmallVector<int, 16> SrcMask, ScaledMask;
51472 SDValue BC = peekThroughBitcasts(Op);
51473 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
51474 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
51475 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
51476 })) {
51477 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
51478 if (!UseSubVector && SrcOps.size() <= 2 &&
51479 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
51480 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
51481 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
51482 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
51483 }
51484 if (UseSubVector && SrcOps.size() == 1 &&
51485 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
51486 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
51487 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
51488 ShuffleMask.assign(Mask.begin(), Mask.end());
51489 }
51490 }
51491 };
51492
51493 // View LHS in the form
51494 // LHS = VECTOR_SHUFFLE A, B, LMask
51495 // If LHS is not a shuffle, then pretend it is the identity shuffle:
51496 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
51497 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
51498 SDValue A, B;
51499 SmallVector<int, 16> LMask;
51500 GetShuffle(LHS, A, B, LMask);
51501
51502 // Likewise, view RHS in the form
51503 // RHS = VECTOR_SHUFFLE C, D, RMask
51504 SDValue C, D;
51505 SmallVector<int, 16> RMask;
51506 GetShuffle(RHS, C, D, RMask);
51507
51508 // At least one of the operands should be a vector shuffle.
51509 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
51510 if (NumShuffles == 0)
51511 return false;
51512
51513 if (LMask.empty()) {
51514 A = LHS;
51515 for (unsigned i = 0; i != NumElts; ++i)
51516 LMask.push_back(i);
51517 }
51518
51519 if (RMask.empty()) {
51520 C = RHS;
51521 for (unsigned i = 0; i != NumElts; ++i)
51522 RMask.push_back(i);
51523 }
51524
51525 // If we have a unary mask, ensure the other op is set to null.
51526 if (isUndefOrInRange(LMask, 0, NumElts))
51527 B = SDValue();
51528 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
51529 A = SDValue();
51530
51531 if (isUndefOrInRange(RMask, 0, NumElts))
51532 D = SDValue();
51533 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51534 C = SDValue();
51535
51536 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51537 // RHS operands and shuffle mask.
51538 if (A != C) {
51539 std::swap(C, D);
51540 ShuffleVectorSDNode::commuteMask(RMask);
51541 }
51542 // Check that the shuffles are both shuffling the same vectors.
51543 if (!(A == C && B == D))
51544 return false;
51545
51546 PostShuffleMask.clear();
51547 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51548
51549 // LHS and RHS are now:
51550 // LHS = shuffle A, B, LMask
51551 // RHS = shuffle A, B, RMask
51552 // Check that the masks correspond to performing a horizontal operation.
51553 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51554 // so we just repeat the inner loop if this is a 256-bit op.
51555 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51556 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51557 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51558 assert((NumEltsPer128BitChunk % 2 == 0) &&
51559 "Vector type should have an even number of elements in each lane");
51560 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51561 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51562 // Ignore undefined components.
51563 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51564 if (LIdx < 0 || RIdx < 0 ||
51565 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51566 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51567 continue;
51568
51569 // Check that successive odd/even elements are being operated on. If not,
51570 // this is not a horizontal operation.
51571 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51572 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51573 return false;
51574
51575 // Compute the post-shuffle mask index based on where the element
51576 // is stored in the HOP result, and where it needs to be moved to.
51577 int Base = LIdx & ~1u;
51578 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51579 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
51580
51581 // The low half of the 128-bit result must choose from A.
51582 // The high half of the 128-bit result must choose from B,
51583 // unless B is undef. In that case, we are always choosing from A.
51584 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51585 Index += NumEltsPer64BitChunk;
51586 PostShuffleMask[i + j] = Index;
51587 }
51588 }
51589
51590 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51591 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51592
51593 bool IsIdentityPostShuffle =
51594 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51595 if (IsIdentityPostShuffle)
51596 PostShuffleMask.clear();
51597
51598 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
51599 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51600 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51601 return false;
51602
51603 // If the source nodes are already used in HorizOps then always accept this.
51604 // Shuffle folding should merge these back together.
51605 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51606 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51607 });
51608 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51609 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51610 });
51611 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51612
51613 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51614 // shuffle the result.
51615 if (!ForceHorizOp &&
51616 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51617 (NumShuffles < 2 || !IsIdentityPostShuffle),
51618 DAG, Subtarget))
51619 return false;
51620
51621 LHS = DAG.getBitcast(VT, NewLHS);
51622 RHS = DAG.getBitcast(VT, NewRHS);
51623 return true;
51624}
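// Illustrative sketch: for v4f32, with
//   LHS = shuffle A, B, <0,2,4,6> and RHS = shuffle A, B, <1,3,5,7>,
// (fadd LHS, RHS) equals <a0+a1, a2+a3, b0+b1, b2+b3>, i.e. the SSE3 haddps
// of A and B. When the pairs arrive in a permuted order, PostShuffleMask
// records how to shuffle the HADD/HSUB result back into the required order.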
51625
51626// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
51627static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51628 const X86Subtarget &Subtarget) {
51629 EVT VT = N->getValueType(0);
51630 unsigned Opcode = N->getOpcode();
51631 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51632 SmallVector<int, 8> PostShuffleMask;
51633
51634 switch (Opcode) {
51635 case ISD::FADD:
51636 case ISD::FSUB:
51637 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51638 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51639 SDValue LHS = N->getOperand(0);
51640 SDValue RHS = N->getOperand(1);
51641 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51642 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51643 PostShuffleMask)) {
51644 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51645 if (!PostShuffleMask.empty())
51646 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51647 DAG.getUNDEF(VT), PostShuffleMask);
51648 return HorizBinOp;
51649 }
51650 }
51651 break;
51652 case ISD::ADD:
51653 case ISD::SUB:
51654 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51655 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51656 SDValue LHS = N->getOperand(0);
51657 SDValue RHS = N->getOperand(1);
51658 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51659 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51660 PostShuffleMask)) {
51661 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51662 ArrayRef<SDValue> Ops) {
51663 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51664 };
51665 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51666 {LHS, RHS}, HOpBuilder);
51667 if (!PostShuffleMask.empty())
51668 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51669 DAG.getUNDEF(VT), PostShuffleMask);
51670 return HorizBinOp;
51671 }
51672 }
51673 break;
51674 }
51675
51676 return SDValue();
51677}
51678
51679// Try to combine the following nodes
51680// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51681// <i32 -2147483648[float -0.000000e+00]> 0
51682// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51683// <(load 4 from constant-pool)> t0, t29
51684// [t30: v16i32 = bitcast t27]
51685// t6: v16i32 = xor t7, t27[t30]
51686// t11: v16f32 = bitcast t6
51687// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51688// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51689// t22: v16f32 = bitcast t7
51690// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51691// t24: v32f16 = bitcast t23
51692static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51693 const X86Subtarget &Subtarget) {
51694 EVT VT = N->getValueType(0);
51695 SDValue LHS = N->getOperand(0);
51696 SDValue RHS = N->getOperand(1);
51697 int CombineOpcode =
51698 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51699 auto isConjugationConstant = [](const Constant *c) {
51700 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
51701 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51702 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51703 switch (CI->getBitWidth()) {
51704 case 16:
51705 return false;
51706 case 32:
51707 return CI->getValue() == ConjugationInt32;
51708 case 64:
51709 return CI->getValue() == ConjugationInt64;
51710 default:
51711 llvm_unreachable("Unexpected bit width");
51712 }
51713 }
51714 if (const auto *CF = dyn_cast<ConstantFP>(c))
51715 return CF->isNegativeZeroValue();
51716 return false;
51717 };
51718 auto combineConjugation = [&](SDValue &r) {
51719 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51720 SDValue XOR = LHS.getOperand(0);
51721 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51722 SDValue XORRHS = XOR.getOperand(1);
51723 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
51724 XORRHS = XORRHS.getOperand(0);
51725 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
51726 XORRHS.getOperand(1).getNumOperands()) {
51727 ConstantPoolSDNode *CP =
51728 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
51729 if (CP && isConjugationConstant(CP->getConstVal())) {
51730 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51731 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51732 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51733 r = DAG.getBitcast(VT, FCMulC);
51734 return true;
51735 }
51736 }
51737 }
51738 }
51739 return false;
51740 };
51741 SDValue Res;
51742 if (combineConjugation(Res))
51743 return Res;
51744 std::swap(LHS, RHS);
51745 if (combineConjugation(Res))
51746 return Res;
51747 return Res;
51748}
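// Illustrative note: each packed complex FP16 value occupies 32 bits with the
// imaginary half in the upper 16 bits, so XORing with a splat of 0x80000000
// (or 0x8000000080000000 per 64 bits) negates the imaginary parts, i.e.
// conjugates the operand. The combine above folds that explicit conjugation
// into the multiply by switching between VFMULC and VFCMULC.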
51749
51750// Try to combine the following nodes:
51751// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51752static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51753 const X86Subtarget &Subtarget) {
51754 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51755 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51756 Flags.hasAllowContract();
51757 };
51758
51759 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51760 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51761 Flags.hasNoSignedZeros();
51762 };
51763 auto IsVectorAllNegativeZero = [](const SDNode *N) {
51764 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
51765 return false;
51766 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
51767 "Unexpected vector type!");
51768 if (ConstantPoolSDNode *CP =
51769 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
51770 APInt AI = APInt(32, 0x80008000, true);
51771 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
51772 return CI->getValue() == AI;
51773 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
51774 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
51775 }
51776 return false;
51777 };
51778
51779 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
51780 !AllowContract(N->getFlags()))
51781 return SDValue();
51782
51783 EVT VT = N->getValueType(0);
51784 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
51785 return SDValue();
51786
51787 SDValue LHS = N->getOperand(0);
51788 SDValue RHS = N->getOperand(1);
51789 bool IsConj;
51790 SDValue FAddOp1, MulOp0, MulOp1;
51791 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
51792 &IsVectorAllNegativeZero,
51793 &HasNoSignedZero](SDValue N) -> bool {
51794 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
51795 return false;
51796 SDValue Op0 = N.getOperand(0);
51797 unsigned Opcode = Op0.getOpcode();
51798 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
51799 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
51800 MulOp0 = Op0.getOperand(0);
51801 MulOp1 = Op0.getOperand(1);
51802 IsConj = Opcode == X86ISD::VFCMULC;
51803 return true;
51804 }
51805 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
51806 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
51807 HasNoSignedZero(Op0->getFlags())) ||
51808 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
51809 MulOp0 = Op0.getOperand(0);
51810 MulOp1 = Op0.getOperand(1);
51811 IsConj = Opcode == X86ISD::VFCMADDC;
51812 return true;
51813 }
51814 }
51815 return false;
51816 };
51817
51818 if (GetCFmulFrom(LHS))
51819 FAddOp1 = RHS;
51820 else if (GetCFmulFrom(RHS))
51821 FAddOp1 = LHS;
51822 else
51823 return SDValue();
51824
51825 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
51826 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
51827 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
51828 // FIXME: How do we handle when fast math flags of FADD are different from
51829 // CFMUL's?
51830 SDValue CFmul =
51831 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
51832 return DAG.getBitcast(VT, CFmul);
51833}
51834
51835/// Do target-specific dag combines on floating-point adds/subs.
51836static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
51837 const X86Subtarget &Subtarget) {
51838 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
51839 return HOp;
51840
51841 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
51842 return COp;
51843
51844 return SDValue();
51845}
51846
51847/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
51848/// the codegen.
51849/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
51850/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
51851/// anything that is guaranteed to be transformed by DAGCombiner.
51852static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
51853 const X86Subtarget &Subtarget,
51854 const SDLoc &DL) {
51855 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
51856 SDValue Src = N->getOperand(0);
51857 unsigned SrcOpcode = Src.getOpcode();
51858 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51859
51860 EVT VT = N->getValueType(0);
51861 EVT SrcVT = Src.getValueType();
51862
51863 auto IsFreeTruncation = [VT](SDValue Op) {
51864 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
51865
51866 // See if this has been extended from a smaller/equal size to
51867 // the truncation size, allowing a truncation to combine with the extend.
51868 unsigned Opcode = Op.getOpcode();
51869 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
51870 Opcode == ISD::ZERO_EXTEND) &&
51871 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
51872 return true;
51873
51874 // See if this is a single use constant which can be constant folded.
51875 // NOTE: We don't peek through bitcasts here because there is currently
51876 // no support for constant folding truncate+bitcast+vector_of_constants. So
51877 // we'll just end up with a truncate on both operands which will
51878 // get turned back into (truncate (binop)) causing an infinite loop.
51879 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
51880 };
51881
51882 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
51883 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
51884 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
51885 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
51886 };
51887
51888 // Don't combine if the operation has other uses.
51889 if (!Src.hasOneUse())
51890 return SDValue();
51891
51892 // Only support vector truncation for now.
51893 // TODO: i64 scalar math would benefit as well.
51894 if (!VT.isVector())
51895 return SDValue();
51896
51897 // In most cases it's only worth pre-truncating if we're only facing the cost
51898 // of one truncation.
51899 // i.e. if one of the inputs will constant fold or the input is repeated.
51900 switch (SrcOpcode) {
51901 case ISD::MUL:
51902 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
51903 // better to truncate if we have the chance.
51904 if (SrcVT.getScalarType() == MVT::i64 &&
51905 TLI.isOperationLegal(SrcOpcode, VT) &&
51906 !TLI.isOperationLegal(SrcOpcode, SrcVT))
51907 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
51908 [[fallthrough]];
51909 case ISD::AND:
51910 case ISD::XOR:
51911 case ISD::OR:
51912 case ISD::ADD:
51913 case ISD::SUB: {
51914 SDValue Op0 = Src.getOperand(0);
51915 SDValue Op1 = Src.getOperand(1);
51916 if (TLI.isOperationLegal(SrcOpcode, VT) &&
51917 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
51918 return TruncateArithmetic(Op0, Op1);
51919 break;
51920 }
51921 }
51922
51923 return SDValue();
51924}
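// Illustrative sketch: the scalar analogue of the fold above is
//   (i8)(x + y) == (i8)x + (i8)y
// and it is only applied when at most one real truncate is introduced, e.g.
//   trunc(mul(x, C)) -> mul(trunc(x), trunc(C))   // trunc(C) constant-folds
// or when both operands of the binop are the same value.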
51925
51926/// Truncate using ISD::AND mask and X86ISD::PACKUS.
51927/// e.g. trunc <8 x i32> X to <8 x i16> -->
51928/// MaskX = X & 0xffff (clear high bits to prevent saturation)
51929/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
51930static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
51931 const X86Subtarget &Subtarget,
51932 SelectionDAG &DAG) {
51933 SDValue In = N->getOperand(0);
51934 EVT InVT = In.getValueType();
51935 EVT OutVT = N->getValueType(0);
51936
51937 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
51938 OutVT.getScalarSizeInBits());
51939 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
51940 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
51941}
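// Worked example: truncating the i32 value 0x12345678 to i16 must produce
// 0x5678, but PACKUS saturates, so packing 0x12345678 directly would yield
// 0xFFFF. Masking with 0xFFFF first leaves 0x00005678, which PACKUS then
// passes through unchanged.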
51942
51943/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
51944static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
51945 const X86Subtarget &Subtarget,
51946 SelectionDAG &DAG) {
51947 SDValue In = N->getOperand(0);
51948 EVT InVT = In.getValueType();
51949 EVT OutVT = N->getValueType(0);
51950 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
51951 DAG.getValueType(OutVT));
51952 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
51953}
51954
51955/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
51956/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
51957/// legalization the truncation will be translated into a BUILD_VECTOR with each
51958/// element that is extracted from a vector and then truncated, and it is
51959/// difficult to do this optimization based on them.
51960static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
51961 const X86Subtarget &Subtarget) {
51962 EVT OutVT = N->getValueType(0);
51963 if (!OutVT.isVector())
51964 return SDValue();
51965
51966 SDValue In = N->getOperand(0);
51967 if (!In.getValueType().isSimple())
51968 return SDValue();
51969
51970 EVT InVT = In.getValueType();
51971 unsigned NumElems = OutVT.getVectorNumElements();
51972
51973 // AVX512 provides fast truncate ops.
51974 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
51975 return SDValue();
51976
51977 EVT OutSVT = OutVT.getVectorElementType();
51978 EVT InSVT = InVT.getVectorElementType();
51979 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
51980 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
51981 NumElems >= 8))
51982 return SDValue();
51983
51984 // SSSE3's pshufb results in fewer instructions in the cases below.
51985 if (Subtarget.hasSSSE3() && NumElems == 8) {
51986 if (InSVT == MVT::i16)
51987 return SDValue();
51988 if (InSVT == MVT::i32 &&
51989 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
51990 return SDValue();
51991 }
51992
51993 SDLoc DL(N);
51994 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
51995 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
51996 // truncate 2 x v4i32 to v8i16.
51997 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
51998 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
51999 if (InSVT == MVT::i32)
52000 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52001
52002 return SDValue();
52003}
52004
52005/// This function transforms vector truncation of 'extended sign-bits' or
52006/// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
52007/// into X86ISD::PACKSS/PACKUS operations.
52008static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52009 SelectionDAG &DAG,
52010 const X86Subtarget &Subtarget) {
52011 // Requires SSE2.
52012 if (!Subtarget.hasSSE2())
52013 return SDValue();
52014
52015 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52016 return SDValue();
52017
52018 SDValue In = N->getOperand(0);
52019 if (!In.getValueType().isSimple())
52020 return SDValue();
52021
52022 MVT VT = N->getValueType(0).getSimpleVT();
52023 MVT SVT = VT.getScalarType();
52024
52025 MVT InVT = In.getValueType().getSimpleVT();
52026 MVT InSVT = InVT.getScalarType();
52027
52028 // Check we have a truncation suited for PACKSS/PACKUS.
52029 if (!isPowerOf2_32(VT.getVectorNumElements()))
52030 return SDValue();
52031 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52032 return SDValue();
52033 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52034 return SDValue();
52035
52036 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52037 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52038 return SDValue();
52039
52040 // AVX512 has fast truncate, but if the input is already going to be split,
52041 // there's no harm in trying pack.
52042 if (Subtarget.hasAVX512() &&
52043 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52044 InVT.is512BitVector())) {
52045 // PACK should still be worth it for 128-bit vectors if the sources were
52046 // originally concatenated from subvectors.
52047 SmallVector<SDValue> ConcatOps;
52048 if (VT.getSizeInBits() > 128 ||
52049 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52050 return SDValue();
52051 }
52052
52053 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52054 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52055
52056 // Use PACKUS if the input has zero-bits that extend all the way to the
52057 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52058 KnownBits Known = DAG.computeKnownBits(In);
52059 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52060 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52061 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52062
52063 // Use PACKSS if the input has sign-bits that extend all the way to the
52064 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52065 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52066
52067 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52068 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52069 // on and combines/simplifications can't then use it.
52070 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52071 return SDValue();
52072
52073 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52074 if (NumSignBits > MinSignBits)
52075 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52076
52077 // If we have a srl that only generates signbits that we will discard in
52078 // the truncation then we can use PACKSS by converting the srl to a sra.
52079 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52080 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52081 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52082 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52083 if (*ShAmt == MinSignBits) {
52084 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52085 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52086 Subtarget);
52087 }
52088 }
52089
52090 return SDValue();
52091}
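// Illustrative sketch: a typical input is a comparison result that was
// sign-extended to the wide type, e.g.
//   t1: v8i32 = setcc a, b, setlt      ; every lane is all-ones or all-zeros
//   t2: v8i16 = truncate t1
// Here all 32 bits of each lane are sign bits, so the truncate can be lowered
// with X86ISD::PACKSS on the two 128-bit halves of t1, with no masking or
// sign-extension fixups required.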
52092
52093// Try to form a MULHU or MULHS node by looking for
52094// (trunc (srl (mul ext, ext), 16))
52095// TODO: This is X86 specific because we want to be able to handle wide types
52096// before type legalization. But we can only do it if the vector will be
52097// legalized via widening/splitting. Type legalization can't handle promotion
52098// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52099// combiner.
52100static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52101 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52102 // First instruction should be a right shift of a multiply.
52103 if (Src.getOpcode() != ISD::SRL ||
52104 Src.getOperand(0).getOpcode() != ISD::MUL)
52105 return SDValue();
52106
52107 if (!Subtarget.hasSSE2())
52108 return SDValue();
52109
52110 // Only handle vXi16 types that are at least 128-bits unless they will be
52111 // widened.
52112 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52113 return SDValue();
52114
52115 // Input type should be at least vXi32.
52116 EVT InVT = Src.getValueType();
52117 if (InVT.getVectorElementType().getSizeInBits() < 32)
52118 return SDValue();
52119
52120 // Need a shift by 16.
52121 APInt ShiftAmt;
52122 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
52123 ShiftAmt != 16)
52124 return SDValue();
52125
52126 SDValue LHS = Src.getOperand(0).getOperand(0);
52127 SDValue RHS = Src.getOperand(0).getOperand(1);
52128
52129 // Count leading sign/zero bits on both inputs - if there are enough then
52130 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52131 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
52132 // truncations may actually be free by peeking through to the ext source.
52133 auto IsSext = [&DAG](SDValue V) {
52134 return DAG.ComputeMaxSignificantBits(V) <= 16;
52135 };
52136 auto IsZext = [&DAG](SDValue V) {
52137 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
52138 };
52139
52140 bool IsSigned = IsSext(LHS) && IsSext(RHS);
52141 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
52142 if (!IsSigned && !IsUnsigned)
52143 return SDValue();
52144
52145 // Check if both inputs are extensions, which will be removed by truncation.
52146 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
52147 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
52148 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
52149 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
52150 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
52151 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
52152
52153 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
52154 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
52155 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
52156 // will have to split anyway.
52157 unsigned InSizeInBits = InVT.getSizeInBits();
52158 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
52159 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
52160 (InSizeInBits % 16) == 0) {
52161 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52162 InVT.getSizeInBits() / 16);
52163 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
52164 DAG.getBitcast(BCVT, RHS));
52165 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
52166 }
52167
52168 // Truncate back to source type.
52169 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
52170 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
52171
52172 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
52173 return DAG.getNode(Opc, DL, VT, LHS, RHS);
52174}
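// Illustrative sketch: the scalar shape of the pattern matched above is
//   (uint16_t)(((uint32_t)a * (uint32_t)b) >> 16)  // a, b known to fit in 16 bits
// which is exactly the high-half multiply PMULHUW/PMULHW computes, so the
// widen-multiply-shift-truncate chain collapses to a single MULHU/MULHS node
// on the narrow vXi16 type.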
52175
52176// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
52177// from one vector with signed bytes from another vector, adds together
52178// adjacent pairs of 16-bit products, and saturates the result before
52179// truncating to 16-bits.
52180//
52181// Which looks something like this:
52182// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
52183// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
52184static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
52185 const X86Subtarget &Subtarget,
52186 const SDLoc &DL) {
52187 if (!VT.isVector() || !Subtarget.hasSSSE3())
52188 return SDValue();
52189
52190 unsigned NumElems = VT.getVectorNumElements();
52191 EVT ScalarVT = VT.getVectorElementType();
52192 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
52193 return SDValue();
52194
52195 SDValue SSatVal = detectSSatPattern(In, VT);
52196 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
52197 return SDValue();
52198
52199 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
52200 // of multiplies from even/odd elements.
52201 SDValue N0 = SSatVal.getOperand(0);
52202 SDValue N1 = SSatVal.getOperand(1);
52203
52204 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52205 return SDValue();
52206
52207 SDValue N00 = N0.getOperand(0);
52208 SDValue N01 = N0.getOperand(1);
52209 SDValue N10 = N1.getOperand(0);
52210 SDValue N11 = N1.getOperand(1);
52211
52212 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
52213 // Canonicalize zero_extend to LHS.
52214 if (N01.getOpcode() == ISD::ZERO_EXTEND)
52215 std::swap(N00, N01);
52216 if (N11.getOpcode() == ISD::ZERO_EXTEND)
52217 std::swap(N10, N11);
52218
52219 // Ensure we have a zero_extend and a sign_extend.
52220 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
52221 N01.getOpcode() != ISD::SIGN_EXTEND ||
52222 N10.getOpcode() != ISD::ZERO_EXTEND ||
52223 N11.getOpcode() != ISD::SIGN_EXTEND)
52224 return SDValue();
52225
52226 // Peek through the extends.
52227 N00 = N00.getOperand(0);
52228 N01 = N01.getOperand(0);
52229 N10 = N10.getOperand(0);
52230 N11 = N11.getOperand(0);
52231
52232 // Ensure the extend is from vXi8.
52233 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
52234 N01.getValueType().getVectorElementType() != MVT::i8 ||
52235 N10.getValueType().getVectorElementType() != MVT::i8 ||
52236 N11.getValueType().getVectorElementType() != MVT::i8)
52237 return SDValue();
52238
52239 // All inputs should be build_vectors.
52240 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52241 N01.getOpcode() != ISD::BUILD_VECTOR ||
52242 N10.getOpcode() != ISD::BUILD_VECTOR ||
52243 N11.getOpcode() != ISD::BUILD_VECTOR)
52244 return SDValue();
52245
52246 // N00/N10 are zero extended. N01/N11 are sign extended.
52247
52248 // For each element, we need to ensure we have an odd element from one vector
52249 // multiplied by the odd element of another vector and the even element from
52250 // one of the same vectors being multiplied by the even element from the
52251 // other vector. So we need to make sure for each element i, this operator
52252 // is being performed:
52253 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52254 SDValue ZExtIn, SExtIn;
52255 for (unsigned i = 0; i != NumElems; ++i) {
52256 SDValue N00Elt = N00.getOperand(i);
52257 SDValue N01Elt = N01.getOperand(i);
52258 SDValue N10Elt = N10.getOperand(i);
52259 SDValue N11Elt = N11.getOperand(i);
52260 // TODO: Be more tolerant to undefs.
52261 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52262 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52263 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52264 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52265 return SDValue();
52266 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52267 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52268 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52269 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52270 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52271 return SDValue();
52272 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52273 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52274 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52275 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52276 // Add is commutative so indices can be reordered.
52277 if (IdxN00 > IdxN10) {
52278 std::swap(IdxN00, IdxN10);
52279 std::swap(IdxN01, IdxN11);
52280 }
52281 // N0 indices must be the even element. N1 indices must be the next odd element.
52282 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52283 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52284 return SDValue();
52285 SDValue N00In = N00Elt.getOperand(0);
52286 SDValue N01In = N01Elt.getOperand(0);
52287 SDValue N10In = N10Elt.getOperand(0);
52288 SDValue N11In = N11Elt.getOperand(0);
52289 // First time we find an input capture it.
52290 if (!ZExtIn) {
52291 ZExtIn = N00In;
52292 SExtIn = N01In;
52293 }
52294 if (ZExtIn != N00In || SExtIn != N01In ||
52295 ZExtIn != N10In || SExtIn != N11In)
52296 return SDValue();
52297 }
52298
52299 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52300 ArrayRef<SDValue> Ops) {
52301 // Shrink by adding truncate nodes and let DAGCombine fold with the
52302 // sources.
52303 EVT InVT = Ops[0].getValueType();
52304 assert(InVT.getScalarType() == MVT::i8 &&
52305 "Unexpected scalar element type");
52306 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52307 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52308 InVT.getVectorNumElements() / 2);
52309 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52310 };
52311 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52312 PMADDBuilder);
52313}
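
To make the matched pattern concrete, here is a minimal standalone C++ sketch (an editorial illustration, not taken from the listing; the helper name is illustrative) of what a single PMADDUBSW output lane computes: unsigned bytes from one input times signed bytes from the other, adjacent pairs summed, then signed-saturated to 16 bits.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Illustrative scalar model of one PMADDUBSW output lane (assumes C++17).
static int16_t pmaddubsw_lane(uint8_t A0, uint8_t A1, int8_t B0, int8_t B1) {
  // Widen, multiply the even/odd pair, add, then signed-saturate to 16 bits.
  int32_t Sum = int32_t(A0) * B0 + int32_t(A1) * B1;
  Sum = std::clamp(Sum, int32_t(INT16_MIN), int32_t(INT16_MAX));
  return int16_t(Sum);
}

int main() {
  // 255*127 + 255*127 = 64770, which saturates to 32767.
  std::printf("%d\n", pmaddubsw_lane(255, 255, 127, 127));
  return 0;
}
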
52314
52315static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52316 const X86Subtarget &Subtarget) {
52317 EVT VT = N->getValueType(0);
52318 SDValue Src = N->getOperand(0);
52319 SDLoc DL(N);
52320
52321 // Attempt to pre-truncate inputs to arithmetic ops instead.
52322 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52323 return V;
52324
52325 // Try to detect AVG pattern first.
52326 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
52327 return Avg;
52328
52329 // Try to detect PMADDUBSW.
52330 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52331 return PMAdd;
52332
52333 // Try to combine truncation with signed/unsigned saturation.
52334 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52335 return Val;
52336
52337 // Try to combine PMULHUW/PMULHW for vXi16.
52338 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52339 return V;
52340
52341 // The bitcast source is a direct mmx result.
52342 // Detect bitcasts between x86mmx and i32.
52343 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52344 SDValue BCSrc = Src.getOperand(0);
52345 if (BCSrc.getValueType() == MVT::x86mmx)
52346 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52347 }
52348
52349 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
52350 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
52351 return V;
52352
52353 return combineVectorTruncation(N, DAG, Subtarget);
52354}
52355
52356static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52357 TargetLowering::DAGCombinerInfo &DCI) {
52358 EVT VT = N->getValueType(0);
52359 SDValue In = N->getOperand(0);
52360 SDLoc DL(N);
52361
52362 if (SDValue SSatVal = detectSSatPattern(In, VT))
52363 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52364 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52365 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52366
52367 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52368 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52369 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52370 return SDValue(N, 0);
52371
52372 return SDValue();
52373}
52374
52375/// Returns the negated value if the node \p N flips sign of FP value.
52376///
52377/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
52378/// or FSUB(0, x)
52379/// AVX512F does not have FXOR, so FNEG is lowered as
52380/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
52381/// In this case we go through all bitcasts.
52382/// This also recognizes splat of a negated value and returns the splat of that
52383/// value.
52384static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
52385 if (N->getOpcode() == ISD::FNEG)
52386 return N->getOperand(0);
52387
52388 // Don't recurse exponentially.
52389 if (Depth > SelectionDAG::MaxRecursionDepth)
52390 return SDValue();
52391
52392 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
52393
52394 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
52395 EVT VT = Op->getValueType(0);
52396
52397 // Make sure the element size doesn't change.
52398 if (VT.getScalarSizeInBits() != ScalarSize)
52399 return SDValue();
52400
52401 unsigned Opc = Op.getOpcode();
52402 switch (Opc) {
52403 case ISD::VECTOR_SHUFFLE: {
52404 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
52405 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
52406 if (!Op.getOperand(1).isUndef())
52407 return SDValue();
52408 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
52409 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
52410 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
52411 cast<ShuffleVectorSDNode>(Op)->getMask());
52412 break;
52413 }
52414 case ISD::INSERT_VECTOR_ELT: {
52415 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52416 // -V, INDEX).
52417 SDValue InsVector = Op.getOperand(0);
52418 SDValue InsVal = Op.getOperand(1);
52419 if (!InsVector.isUndef())
52420 return SDValue();
52421 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52422 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
52423 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
52424 NegInsVal, Op.getOperand(2));
52425 break;
52426 }
52427 case ISD::FSUB:
52428 case ISD::XOR:
52429 case X86ISD::FXOR: {
52430 SDValue Op1 = Op.getOperand(1);
52431 SDValue Op0 = Op.getOperand(0);
52432
52433 // For XOR and FXOR, we want to check if constant
52434 // bits of Op1 are sign bit masks. For FSUB, we
52435 // have to check if constant bits of Op0 are sign
52436 // bit masks and hence we swap the operands.
52437 if (Opc == ISD::FSUB)
52438 std::swap(Op0, Op1);
52439
52440 APInt UndefElts;
52441 SmallVector<APInt, 16> EltBits;
52442 // Extract constant bits and see if they are all
52443 // sign bit masks. Ignore the undef elements.
52444 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
52445 /* AllowWholeUndefs */ true,
52446 /* AllowPartialUndefs */ false)) {
52447 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
52448 if (!UndefElts[I] && !EltBits[I].isSignMask())
52449 return SDValue();
52450
52451 // Only allow bitcast from correctly-sized constant.
52452 Op0 = peekThroughBitcasts(Op0);
52453 if (Op0.getScalarValueSizeInBits() == ScalarSize)
52454 return Op0;
52455 }
52456 break;
52457 } // case
52458 } // switch
52459
52460 return SDValue();
52461}
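
As a standalone illustration of the sign-bit-mask form recognized above (an editorial sketch, not part of the listing), XORing an IEEE-754 float's bit pattern with 0x80000000 is equivalent to negating it:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float X = 3.5f;
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u; // flip only the sign bit
  float Negated;
  std::memcpy(&Negated, &Bits, sizeof(Negated));
  std::printf("%g %g\n", -X, Negated); // both print -3.5
  return 0;
}
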
52462
52463static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
52464 bool NegRes) {
52465 if (NegMul) {
52466 switch (Opcode) {
52467 default: llvm_unreachable("Unexpected opcode");
52468 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
52469 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
52470 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
52471 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
52472 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
52473 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
52474 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
52475 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
52476 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
52477 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
52478 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
52479 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
52480 }
52481 }
52482
52483 if (NegAcc) {
52484 switch (Opcode) {
52485 default: llvm_unreachable("Unexpected opcode");
52486 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
52487 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
52488 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52489 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
52490 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
52491 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52492 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
52493 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
52494 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52495 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
52496 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
52497 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52498 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
52499 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
52500 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
52501 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
52502 }
52503 }
52504
52505 if (NegRes) {
52506 switch (Opcode) {
52507 // For accuracy reasons, we never combine fneg and fma under strict FP.
52508 default: llvm_unreachable("Unexpected opcode");
52509 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
52510 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52511 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
52512 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52513 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
52514 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52515 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
52516 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52517 }
52518 }
52519
52520 return Opcode;
52521}
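
The opcode remapping above follows the usual scalar FMA sign identities; a small self-contained check of two of them (illustrative only, using ordinary IEEE doubles):

#include <cstdio>

int main() {
  double A = 2.0, B = 3.0, C = 5.0;
  // Negating one multiplicand: FMADD -> FNMADD, i.e. a*b+c becomes -(a*b)+c.
  std::printf("%g %g\n", -(A * B) + C, (-A) * B + C);
  // Negating the result: FMADD -> FNMSUB, i.e. -(a*b+c) == (-a)*b - c.
  std::printf("%g %g\n", -(A * B + C), (-A) * B - C);
  return 0;
}
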
52522
52523/// Do target-specific dag combines on floating point negations.
52524static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
52525 TargetLowering::DAGCombinerInfo &DCI,
52526 const X86Subtarget &Subtarget) {
52527 EVT OrigVT = N->getValueType(0);
52528 SDValue Arg = isFNEG(DAG, N);
52529 if (!Arg)
52530 return SDValue();
52531
52532 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52533 EVT VT = Arg.getValueType();
52534 EVT SVT = VT.getScalarType();
52535 SDLoc DL(N);
52536
52537 // Let legalize expand this if it isn't a legal type yet.
52538 if (!TLI.isTypeLegal(VT))
52539 return SDValue();
52540
52541 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52542 // use of a constant by performing (-0 - A*B) instead.
52543 // FIXME: Check rounding control flags as well once it becomes available.
52544 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52545 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52546 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52547 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52548 Arg.getOperand(1), Zero);
52549 return DAG.getBitcast(OrigVT, NewNode);
52550 }
52551
52552 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52553 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52554 if (SDValue NegArg =
52555 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52556 return DAG.getBitcast(OrigVT, NegArg);
52557
52558 return SDValue();
52559}
52560
52561SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52562 bool LegalOperations,
52563 bool ForCodeSize,
52564 NegatibleCost &Cost,
52565 unsigned Depth) const {
52566 // fneg patterns are removable even if they have multiple uses.
52567 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52568 Cost = NegatibleCost::Cheaper;
52569 return DAG.getBitcast(Op.getValueType(), Arg);
52570 }
52571
52572 EVT VT = Op.getValueType();
52573 EVT SVT = VT.getScalarType();
52574 unsigned Opc = Op.getOpcode();
52575 SDNodeFlags Flags = Op.getNode()->getFlags();
52576 switch (Opc) {
52577 case ISD::FMA:
52578 case X86ISD::FMSUB:
52579 case X86ISD::FNMADD:
52580 case X86ISD::FNMSUB:
52581 case X86ISD::FMADD_RND:
52582 case X86ISD::FMSUB_RND:
52583 case X86ISD::FNMADD_RND:
52584 case X86ISD::FNMSUB_RND: {
52585 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52586 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52587 !isOperationLegal(ISD::FMA, VT))
52588 break;
52589
52590 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52591 // if it may have signed zeros.
52592 if (!Flags.hasNoSignedZeros())
52593 break;
52594
52595 // This is always negatible for free but we might be able to remove some
52596 // extra operand negations as well.
52597 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52598 for (int i = 0; i != 3; ++i)
52599 NewOps[i] = getCheaperNegatedExpression(
52600 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52601
52602 bool NegA = !!NewOps[0];
52603 bool NegB = !!NewOps[1];
52604 bool NegC = !!NewOps[2];
52605 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52606
52607 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52608 : NegatibleCost::Neutral;
52609
52610 // Fill in the non-negated ops with the original values.
52611 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52612 if (!NewOps[i])
52613 NewOps[i] = Op.getOperand(i);
52614 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52615 }
52616 case X86ISD::FRCP:
52617 if (SDValue NegOp0 =
52618 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52619 ForCodeSize, Cost, Depth + 1))
52620 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52621 break;
52622 }
52623
52624 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52625 ForCodeSize, Cost, Depth);
52626}
52627
52628static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52629 const X86Subtarget &Subtarget) {
52630 MVT VT = N->getSimpleValueType(0);
52631 // If we have integer vector types available, use the integer opcodes.
52632 if (!VT.isVector() || !Subtarget.hasSSE2())
52633 return SDValue();
52634
52635 SDLoc dl(N);
52636
52637 unsigned IntBits = VT.getScalarSizeInBits();
52638 MVT IntSVT = MVT::getIntegerVT(IntBits);
52639 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52640
52641 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52642 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52643 unsigned IntOpcode;
52644 switch (N->getOpcode()) {
52645 default: llvm_unreachable("Unexpected FP logic op");
52646 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52647 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52648 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52649 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52650 }
52651 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52652 return DAG.getBitcast(VT, IntOp);
52653}
52654
52655
52656/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52657static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52658 if (N->getOpcode() != ISD::XOR)
52659 return SDValue();
52660
52661 SDValue LHS = N->getOperand(0);
52662 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52663 return SDValue();
52664
52665 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52666 X86::CondCode(LHS->getConstantOperandVal(0)));
52667 SDLoc DL(N);
52668 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52669}
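
The fold above relies on XORing a 0/1 condition with 1 being the same as testing the inverted condition; a trivial scalar check (illustrative, not part of the listing):

#include <cstdio>

int main() {
  int A = 3, B = 7;
  // xor(setcc(lt), 1) equals setcc(ge) on the same operands.
  std::printf("%d %d\n", (A < B) ^ 1, (A >= B) ? 1 : 0); // both print 0
  return 0;
}
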
52670
52671static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
52672 const X86Subtarget &Subtarget) {
52673 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52674 "Invalid opcode for combining with CTLZ");
52675 if (Subtarget.hasFastLZCNT())
52676 return SDValue();
52677
52678 EVT VT = N->getValueType(0);
52679 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
52680 (VT != MVT::i64 || !Subtarget.is64Bit()))
52681 return SDValue();
52682
52683 SDValue N0 = N->getOperand(0);
52684 SDValue N1 = N->getOperand(1);
52685
52686 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
52687 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
52688 return SDValue();
52689
52690 SDValue OpCTLZ;
52691 SDValue OpSizeTM1;
52692
52693 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
52694 OpCTLZ = N1;
52695 OpSizeTM1 = N0;
52696 } else if (N->getOpcode() == ISD::SUB) {
52697 return SDValue();
52698 } else {
52699 OpCTLZ = N0;
52700 OpSizeTM1 = N1;
52701 }
52702
52703 if (!OpCTLZ.hasOneUse())
52704 return SDValue();
52705 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
52706 if (!C)
52707 return SDValue();
52708
52709 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
52710 return SDValue();
52711 SDLoc DL(N);
52712 EVT OpVT = VT;
52713 SDValue Op = OpCTLZ.getOperand(0);
52714 if (VT == MVT::i8) {
52715 // Zero extend to i32 since there is no i8 BSR.
52716 OpVT = MVT::i32;
52717 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
52718 }
52719
52720 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
52721 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
52722 if (VT == MVT::i8)
52723 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
52724
52725 return Op;
52726}
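
The combine above uses the identity that, for a non-zero value, (bit width - 1) minus (or XORed with) the leading-zero count is the index of the most significant set bit, which is exactly what BSR produces. A minimal sketch assuming a GCC/Clang-style __builtin_clz:

#include <cstdint>
#include <cstdio>

// For non-zero X, 31 ^ clz(X) == 31 - clz(X) == index of the highest set bit.
static unsigned bsr32(uint32_t X) { return 31u ^ unsigned(__builtin_clz(X)); }

int main() {
  std::printf("%u %u %u\n", bsr32(1u), bsr32(0x80u), bsr32(0x80000000u)); // 0 7 31
  return 0;
}
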
52727
52728static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52729 TargetLowering::DAGCombinerInfo &DCI,
52730 const X86Subtarget &Subtarget) {
52731 SDValue N0 = N->getOperand(0);
52732 SDValue N1 = N->getOperand(1);
52733 EVT VT = N->getValueType(0);
52734
52735 // If this is SSE1 only convert to FXOR to avoid scalarization.
52736 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52737 return DAG.getBitcast(MVT::v4i32,
52738 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
52739 DAG.getBitcast(MVT::v4f32, N0),
52740 DAG.getBitcast(MVT::v4f32, N1)));
52741 }
52742
52743 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52744 return Cmp;
52745
52746 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52747 return R;
52748
52749 if (SDValue R = combineBitOpWithShift(N, DAG))
52750 return R;
52751
52752 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52753 return FPLogic;
52754
52755 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
52756 return R;
52757
52758 if (DCI.isBeforeLegalizeOps())
52759 return SDValue();
52760
52761 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52762 return SetCC;
52763
52764 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
52765 return R;
52766
52767 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52768 return RV;
52769
52770 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52771 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52772 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52773 N0.getOperand(0).getValueType().isVector() &&
52774 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52775 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52776 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
52777 N0.getOperand(0).getValueType()));
52778 }
52779
52780 // Handle AVX512 mask widening.
52781 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52782 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52783 VT.getVectorElementType() == MVT::i1 &&
52784 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52785 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52786 return DAG.getNode(
52787 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
52788 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
52789 N0.getOperand(2));
52790 }
52791
52792 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52793 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52794 // TODO: Under what circumstances could this be performed in DAGCombine?
52795 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52796 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52797 SDValue TruncExtSrc = N0.getOperand(0);
52798 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52799 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52800 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52801 SDLoc DL(N);
52802 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52803 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52804 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52805 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52806 }
52807 }
52808
52809 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52810 return R;
52811
52812 return combineFneg(N, DAG, DCI, Subtarget);
52813}
52814
52815static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
52816 TargetLowering::DAGCombinerInfo &DCI,
52817 const X86Subtarget &Subtarget) {
52818 EVT VT = N->getValueType(0);
52819 unsigned NumBits = VT.getSizeInBits();
52820
52821 // TODO - Constant Folding.
52822
52823 // Simplify the inputs.
52824 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52825 APInt DemandedMask(APInt::getAllOnes(NumBits));
52826 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52827 return SDValue(N, 0);
52828
52829 return SDValue();
52830}
52831
52832static bool isNullFPScalarOrVectorConst(SDValue V) {
52833 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52834}
52835
52836/// If a value is a scalar FP zero or a vector FP zero (potentially including
52837/// undefined elements), return a zero constant that may be used to fold away
52838/// that value. In the case of a vector, the returned constant will not contain
52839/// undefined elements even if the input parameter does. This makes it suitable
52840/// to be used as a replacement operand with operations (eg, bitwise-and) where
52841/// an undef should not propagate.
52842static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52843 const X86Subtarget &Subtarget) {
52844 if (!isNullFPScalarOrVectorConst(V))
52845 return SDValue();
52846
52847 if (V.getValueType().isVector())
52848 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52849
52850 return V;
52851}
52852
52853static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52854 const X86Subtarget &Subtarget) {
52855 SDValue N0 = N->getOperand(0);
52856 SDValue N1 = N->getOperand(1);
52857 EVT VT = N->getValueType(0);
52858 SDLoc DL(N);
52859
52860 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
52861 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
52862 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
52863 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
52864 return SDValue();
52865
52866 auto isAllOnesConstantFP = [](SDValue V) {
52867 if (V.getSimpleValueType().isVector())
52868 return ISD::isBuildVectorAllOnes(V.getNode());
52869 auto *C = dyn_cast<ConstantFPSDNode>(V);
52870 return C && C->getConstantFPValue()->isAllOnesValue();
52871 };
52872
52873 // fand (fxor X, -1), Y --> fandn X, Y
52874 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
52875 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
52876
52877 // fand X, (fxor Y, -1) --> fandn Y, X
52878 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
52879 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
52880
52881 return SDValue();
52882}
52883
52884/// Do target-specific dag combines on X86ISD::FAND nodes.
52885static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
52886 const X86Subtarget &Subtarget) {
52887 // FAND(0.0, x) -> 0.0
52888 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
52889 return V;
52890
52891 // FAND(x, 0.0) -> 0.0
52892 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52893 return V;
52894
52895 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
52896 return V;
52897
52898 return lowerX86FPLogicOp(N, DAG, Subtarget);
52899}
52900
52901/// Do target-specific dag combines on X86ISD::FANDN nodes.
52902static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
52903 const X86Subtarget &Subtarget) {
52904 // FANDN(0.0, x) -> x
52905 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52906 return N->getOperand(1);
52907
52908 // FANDN(x, 0.0) -> 0.0
52909 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52910 return V;
52911
52912 return lowerX86FPLogicOp(N, DAG, Subtarget);
52913}
52914
52915/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
52916static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
52917 TargetLowering::DAGCombinerInfo &DCI,
52918 const X86Subtarget &Subtarget) {
52919 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
52920
52921 // F[X]OR(0.0, x) -> x
52922 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52923 return N->getOperand(1);
52924
52925 // F[X]OR(x, 0.0) -> x
52926 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
52927 return N->getOperand(0);
52928
52929 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
52930 return NewVal;
52931
52932 return lowerX86FPLogicOp(N, DAG, Subtarget);
52933}
52934
52935/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
52936static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
52937 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
52938
52939 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
52940 if (!DAG.getTarget().Options.NoNaNsFPMath ||
52941 !DAG.getTarget().Options.NoSignedZerosFPMath)
52942 return SDValue();
52943
52944 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
52945 // into FMINC and FMAXC, which are commutative operations.
52946 unsigned NewOp = 0;
52947 switch (N->getOpcode()) {
52948 default: llvm_unreachable("unknown opcode");
52949 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
52950 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
52951 }
52952
52953 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
52954 N->getOperand(0), N->getOperand(1));
52955}
52956
52957static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
52958 const X86Subtarget &Subtarget) {
52959 EVT VT = N->getValueType(0);
52960 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
52961 return SDValue();
52962
52963 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52964
52965 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
52966 (Subtarget.hasSSE2() && VT == MVT::f64) ||
52967 (Subtarget.hasFP16() && VT == MVT::f16) ||
52968 (VT.isVector() && TLI.isTypeLegal(VT))))
52969 return SDValue();
52970
52971 SDValue Op0 = N->getOperand(0);
52972 SDValue Op1 = N->getOperand(1);
52973 SDLoc DL(N);
52974 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
52975
52976 // If we don't have to respect NaN inputs, this is a direct translation to x86
52977 // min/max instructions.
52978 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
52979 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52980
52981 // If one of the operands is known non-NaN, use the native min/max
52982 // instructions with the non-NaN input as the second operand.
52983 if (DAG.isKnownNeverNaN(Op1))
52984 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52985 if (DAG.isKnownNeverNaN(Op0))
52986 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
52987
52988 // If we have to respect NaN inputs, this takes at least 3 instructions.
52989 // Favor a library call when operating on a scalar and minimizing code size.
52990 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
52991 return SDValue();
52992
52993 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
52994 VT);
52995
52996 // There are 4 possibilities involving NaN inputs, and these are the required
52997 // outputs:
52998 //                  Op1
52999 //               Num     NaN
53000 //            -----------------
53001 //      Num  |  Max  |  Op0  |
53002 // Op0       -----------------
53003 //      NaN  |  Op1  |  NaN  |
53004 //            -----------------
53005 //
53006 // The SSE FP max/min instructions were not designed for this case, but rather
53007 // to implement:
53008 // Min = Op1 < Op0 ? Op1 : Op0
53009 // Max = Op1 > Op0 ? Op1 : Op0
53010 //
53011 // So they always return Op0 if either input is a NaN. However, we can still
53012 // use those instructions for fmaxnum by selecting away a NaN input.
53013
53014 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53015 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53016 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53017
53018 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53019 // are NaN, the NaN value of Op1 is the result.
53020 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53021}
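
A standalone scalar model of the sequence built above (an editorial sketch, not part of the listing): SSE MAXSS returns its second source operand when either input is NaN, so fmaxnum is recovered by computing the max with swapped operands and then selecting Op1 whenever Op0 is NaN.

#include <cmath>
#include <cstdio>

// Models MAXSS: returns the second argument (Op0) when either input is NaN.
static double sse_max(double Op1, double Op0) { return Op1 > Op0 ? Op1 : Op0; }

static double fmaxnum_emulated(double Op0, double Op1) {
  double MinOrMax = sse_max(Op1, Op0);
  return std::isnan(Op0) ? Op1 : MinOrMax; // the select in the combine above
}

int main() {
  std::printf("%g\n", fmaxnum_emulated(std::nan(""), 2.0)); // 2 (Op0 is NaN)
  std::printf("%g\n", fmaxnum_emulated(3.0, std::nan(""))); // 3 (Op1 is NaN)
  return 0;
}
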
53022
53023static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53024 TargetLowering::DAGCombinerInfo &DCI) {
53025 EVT VT = N->getValueType(0);
53026 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53027
53028 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53029 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53030 return SDValue(N, 0);
53031
53032 // Convert a full vector load into vzload when not all bits are needed.
53033 SDValue In = N->getOperand(0);
53034 MVT InVT = In.getSimpleValueType();
53035 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53036 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53037 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53038 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53039 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53040 MVT MemVT = MVT::getIntegerVT(NumBits);
53041 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53042 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53043 SDLoc dl(N);
53044 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53045 DAG.getBitcast(InVT, VZLoad));
53046 DCI.CombineTo(N, Convert);
53047 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53048 DCI.recursivelyDeleteUnusedNodes(LN);
53049 return SDValue(N, 0);
53050 }
53051 }
53052
53053 return SDValue();
53054}
53055
53056static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53057 TargetLowering::DAGCombinerInfo &DCI) {
53058 bool IsStrict = N->isTargetStrictFPOpcode();
53059 EVT VT = N->getValueType(0);
53060
53061 // Convert a full vector load into vzload when not all bits are needed.
53062 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53063 MVT InVT = In.getSimpleValueType();
53064 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53065 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53066 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53067 LoadSDNode *LN = cast<LoadSDNode>(In);
53068 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53069 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53070 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53071 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53072 SDLoc dl(N);
53073 if (IsStrict) {
53074 SDValue Convert =
53075 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53076 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53077 DCI.CombineTo(N, Convert, Convert.getValue(1));
53078 } else {
53079 SDValue Convert =
53080 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53081 DCI.CombineTo(N, Convert);
53082 }
53083 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53084 DCI.recursivelyDeleteUnusedNodes(LN);
53085 return SDValue(N, 0);
53086 }
53087 }
53088
53089 return SDValue();
53090}
53091
53092/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53093static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53094 TargetLowering::DAGCombinerInfo &DCI,
53095 const X86Subtarget &Subtarget) {
53096 SDValue N0 = N->getOperand(0);
53097 SDValue N1 = N->getOperand(1);
53098 MVT VT = N->getSimpleValueType(0);
53099 int NumElts = VT.getVectorNumElements();
53100 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53101
53102 // ANDNP(undef, x) -> 0
53103 // ANDNP(x, undef) -> 0
53104 if (N0.isUndef() || N1.isUndef())
53105 return DAG.getConstant(0, SDLoc(N), VT);
53106
53107 // ANDNP(0, x) -> x
53108 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53109 return N1;
53110
53111 // ANDNP(x, 0) -> 0
53112 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53113 return DAG.getConstant(0, SDLoc(N), VT);
53114
53115 // Turn ANDNP back to AND if input is inverted.
53116 if (SDValue Not = IsNOT(N0, DAG))
53117 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53118
53119 // Constant Folding
53120 APInt Undefs0, Undefs1;
53121 SmallVector<APInt> EltBits0, EltBits1;
53122 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
53123 SDLoc DL(N);
53124 APInt ResultUndefs = APInt::getZero(NumElts);
53125
53126 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
53127 SmallVector<APInt> ResultBits;
53128 for (int I = 0; I != NumElts; ++I)
53129 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
53130 return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
53131 }
53132
53133 // Constant fold NOT(N0) to allow us to use AND.
53134 // Ensure this is only performed if we can confirm that the bitcasted source
53135 // has one use to prevent an infinite loop with canonicalizeBitSelect.
53136 if (N0->hasOneUse()) {
53137 SDValue BC0 = peekThroughOneUseBitcasts(N0);
53138 if (BC0.getOpcode() != ISD::BITCAST) {
53139 for (APInt &Elt : EltBits0)
53140 Elt = ~Elt;
53141 SDValue Not = getConstVector(EltBits0, ResultUndefs, VT, DAG, DL);
53142 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
53143 }
53144 }
53145 }
53146
53147 // Attempt to recursively combine a bitmask ANDNP with shuffles.
53148 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
53149 SDValue Op(N, 0);
53150 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53151 return Res;
53152
53153 // If either operand is a constant mask, then only the elements that aren't
53154 // zero are actually demanded by the other operand.
53155 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
53156 APInt UndefElts;
53157 SmallVector<APInt> EltBits;
53158 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
53159 APInt DemandedElts = APInt::getAllOnes(NumElts);
53160 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
53161 EltBits)) {
53162 DemandedBits.clearAllBits();
53163 DemandedElts.clearAllBits();
53164 for (int I = 0; I != NumElts; ++I) {
53165 if (UndefElts[I]) {
53166 // We can't assume an undef src element gives an undef dst - the
53167 // other src might be zero.
53168 DemandedBits.setAllBits();
53169 DemandedElts.setBit(I);
53170 } else if ((Invert && !EltBits[I].isAllOnes()) ||
53171 (!Invert && !EltBits[I].isZero())) {
53172 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
53173 DemandedElts.setBit(I);
53174 }
53175 }
53176 }
53177 return std::make_pair(DemandedBits, DemandedElts);
53178 };
53179 APInt Bits0, Elts0;
53180 APInt Bits1, Elts1;
53181 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
53182 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
53183
53184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53185 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
53186 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
53187 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
53188 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
53189 if (N->getOpcode() != ISD::DELETED_NODE)
53190 DCI.AddToWorklist(N);
53191 return SDValue(N, 0);
53192 }
53193 }
53194
53195 return SDValue();
53196}
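
For reference, the constant folding above computes (~A) & B lane by lane; the same identity on a scalar value (illustrative, not part of the listing):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t A = 0xF0F0F0F0u, B = 0xFF00FF00u;
  std::printf("%08x\n", (~A) & B); // ANDNP semantics: prints 0f000f00
  return 0;
}
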
53197
53198static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53199 TargetLowering::DAGCombinerInfo &DCI) {
53200 SDValue N1 = N->getOperand(1);
53201
53202 // BT ignores high bits in the bit index operand.
53203 unsigned BitWidth = N1.getValueSizeInBits();
53204 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
53205 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53206 if (N->getOpcode() != ISD::DELETED_NODE)
53207 DCI.AddToWorklist(N);
53208 return SDValue(N, 0);
53209 }
53210
53211 return SDValue();
53212}
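
The demanded-bits call above exploits the fact that BT ignores the high bits of its bit-index operand; a scalar model of that wrap-around behaviour (illustrative):

#include <cstdio>

int main() {
  unsigned Value = 1u << 5;
  unsigned Index = 37;                                // 37 & 31 == 5
  std::printf("%u\n", (Value >> (Index & 31u)) & 1u); // prints 1
  return 0;
}
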
53213
53214static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53215 TargetLowering::DAGCombinerInfo &DCI) {
53216 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53217 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53218
53219 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53220 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53221 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53222 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53223 if (N->getOpcode() != ISD::DELETED_NODE)
53224 DCI.AddToWorklist(N);
53225 return SDValue(N, 0);
53226 }
53227
53228 // Convert a full vector load into vzload when not all bits are needed.
53229 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53230 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53231 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53232 SDLoc dl(N);
53233 if (IsStrict) {
53234 SDValue Convert = DAG.getNode(
53235 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53236 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53237 DCI.CombineTo(N, Convert, Convert.getValue(1));
53238 } else {
53239 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53240 DAG.getBitcast(MVT::v8i16, VZLoad));
53241 DCI.CombineTo(N, Convert);
53242 }
53243
53244 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53245 DCI.recursivelyDeleteUnusedNodes(LN);
53246 return SDValue(N, 0);
53247 }
53248 }
53249 }
53250
53251 return SDValue();
53252}
53253
53254// Try to combine sext_in_reg of a cmov of constants by extending the constants.
53255static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53256 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53257
53258 EVT DstVT = N->getValueType(0);
53259
53260 SDValue N0 = N->getOperand(0);
53261 SDValue N1 = N->getOperand(1);
53262 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53263
53264 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53265 return SDValue();
53266
53267 // Look through single use any_extends / truncs.
53268 SDValue IntermediateBitwidthOp;
53269 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53270 N0.hasOneUse()) {
53271 IntermediateBitwidthOp = N0;
53272 N0 = N0.getOperand(0);
53273 }
53274
53275 // See if we have a single use cmov.
53276 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53277 return SDValue();
53278
53279 SDValue CMovOp0 = N0.getOperand(0);
53280 SDValue CMovOp1 = N0.getOperand(1);
53281
53282 // Make sure both operands are constants.
53283 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53284 !isa<ConstantSDNode>(CMovOp1.getNode()))
53285 return SDValue();
53286
53287 SDLoc DL(N);
53288
53289 // If we looked through an any_extend/trunc above, add one to the constants.
53290 if (IntermediateBitwidthOp) {
53291 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53292 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53293 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53294 }
53295
53296 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53297 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53298
53299 EVT CMovVT = DstVT;
53300 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
53301 if (DstVT == MVT::i16) {
53302 CMovVT = MVT::i32;
53303 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53304 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53305 }
53306
53307 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53308 N0.getOperand(2), N0.getOperand(3));
53309
53310 if (CMovVT != DstVT)
53311 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53312
53313 return CMov;
53314}
53315
53316static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53317 const X86Subtarget &Subtarget) {
53318 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53319
53320 if (SDValue V = combineSextInRegCmov(N, DAG))
53321 return V;
53322
53323 EVT VT = N->getValueType(0);
53324 SDValue N0 = N->getOperand(0);
53325 SDValue N1 = N->getOperand(1);
53326 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53327 SDLoc dl(N);
53328
53329 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
53330 // SSE and AVX2 since there is no sign-extended shift right
53331 // operation on a vector with 64-bit elements.
53332 // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
53333 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
53334 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53335 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53336 SDValue N00 = N0.getOperand(0);
53337
53338 // EXTLOAD has a better solution on AVX2:
53339 // it may be replaced with an X86ISD::VSEXT node.
53340 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53341 if (!ISD::isNormalLoad(N00.getNode()))
53342 return SDValue();
53343
53344 // Attempt to promote any comparison mask ops before moving the
53345 // SIGN_EXTEND_INREG in the way.
53346 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
53347 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53348
53349 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53350 SDValue Tmp =
53351 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53352 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53353 }
53354 }
53355 return SDValue();
53356}
53357
53358/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53359/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53360/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53361/// opportunities to combine math ops, use an LEA, or use a complex addressing
53362/// mode. This can eliminate extend, add, and shift instructions.
53363static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53364 const X86Subtarget &Subtarget) {
53365 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53366 Ext->getOpcode() != ISD::ZERO_EXTEND)
53367 return SDValue();
53368
53369 // TODO: This should be valid for other integer types.
53370 EVT VT = Ext->getValueType(0);
53371 if (VT != MVT::i64)
53372 return SDValue();
53373
53374 SDValue Add = Ext->getOperand(0);
53375 if (Add.getOpcode() != ISD::ADD)
53376 return SDValue();
53377
53378 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53379 bool NSW = Add->getFlags().hasNoSignedWrap();
53380 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53381
53382 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
53383 // into the 'zext'
53384 if ((Sext && !NSW) || (!Sext && !NUW))
53385 return SDValue();
53386
53387 // Having a constant operand to the 'add' ensures that we are not increasing
53388 // the instruction count because the constant is extended for free below.
53389 // A constant operand can also become the displacement field of an LEA.
53390 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
53391 if (!AddOp1)
53392 return SDValue();
53393
53394 // Don't make the 'add' bigger if there's no hope of combining it with some
53395 // other 'add' or 'shl' instruction.
53396 // TODO: It may be profitable to generate simpler LEA instructions in place
53397 // of single 'add' instructions, but the cost model for selecting an LEA
53398 // currently has a high threshold.
53399 bool HasLEAPotential = false;
53400 for (auto *User : Ext->uses()) {
53401 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53402 HasLEAPotential = true;
53403 break;
53404 }
53405 }
53406 if (!HasLEAPotential)
53407 return SDValue();
53408
53409 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53410 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
53411 SDValue AddOp0 = Add.getOperand(0);
53412 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53413 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
53414
53415 // The wider add is guaranteed to not wrap because both operands are
53416 // sign-extended.
53417 SDNodeFlags Flags;
53418 Flags.setNoSignedWrap(NSW);
53419 Flags.setNoUnsignedWrap(NUW);
53420 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53421}
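
A small standalone check of the promotion above (illustrative, not part of the listing): when the narrow add is known not to wrap, extending after the add and adding the extended operands produce the same 64-bit value, which is what lets the constant later fold into an LEA displacement.

#include <cstdint>
#include <cstdio>

int main() {
  int32_t X = 1000;            // assume the add is known not to overflow (nsw)
  const int32_t C = 42;
  int64_t SextOfAdd = int64_t(X + C);           // sext(add nsw(x, C))
  int64_t AddOfSext = int64_t(X) + int64_t(C);  // add(sext(x), C_sext)
  std::printf("%lld %lld\n", (long long)SextOfAdd, (long long)AddOfSext);
  return 0;
}
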
53422
53423 // If we encounter an {ANY,SIGN,ZERO}_EXTEND applied to a CMOV with constant
53424 // operands, and the result of the CMOV is not used anywhere else, promote the
53425 // CMOV itself instead of promoting its result. This could be beneficial, because:
53426// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
53427// (or more) pseudo-CMOVs only when they go one-after-another and
53428// getting rid of result extension code after CMOV will help that.
53429// 2) Promotion of constant CMOV arguments is free, hence the
53430// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
53431 // 3) A 16-bit CMOV encoding is 4 bytes and a 32-bit CMOV is 3 bytes, so this
53432 // promotion is also good in terms of code size.
53433 // (A 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
53434 // promotion.)
53435static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
53436 SDValue CMovN = Extend->getOperand(0);
53437 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
53438 return SDValue();
53439
53440 EVT TargetVT = Extend->getValueType(0);
53441 unsigned ExtendOpcode = Extend->getOpcode();
53442 SDLoc DL(Extend);
53443
53444 EVT VT = CMovN.getValueType();
53445 SDValue CMovOp0 = CMovN.getOperand(0);
53446 SDValue CMovOp1 = CMovN.getOperand(1);
53447
53448 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53449 !isa<ConstantSDNode>(CMovOp1.getNode()))
53450 return SDValue();
53451
53452 // Only extend to i32 or i64.
53453 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
53454 return SDValue();
53455
53456 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
53457 // are free.
53458 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
53459 return SDValue();
53460
53461 // If this a zero extend to i64, we should only extend to i32 and use a free
53462 // zero extend to finish.
53463 EVT ExtendVT = TargetVT;
53464 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
53465 ExtendVT = MVT::i32;
53466
53467 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
53468 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
53469
53470 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
53471 CMovN.getOperand(2), CMovN.getOperand(3));
53472
53473 // Finish extending if needed.
53474 if (ExtendVT != TargetVT)
53475 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
53476
53477 return Res;
53478}
53479
53480// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
53481// result type.
53482static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
53483 const X86Subtarget &Subtarget) {
53484 SDValue N0 = N->getOperand(0);
53485 EVT VT = N->getValueType(0);
53486 SDLoc dl(N);
53487
53488 // Only do this combine with AVX512 for vector extends.
53489 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
53490 return SDValue();
53491
53492 // Only combine legal element types.
53493 EVT SVT = VT.getVectorElementType();
53494 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
53495 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
53496 return SDValue();
53497
53498 // We don't have a CMPP instruction for vXf16.
53499 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
53500 return SDValue();
53501 // We can only do this if the vector size is 256 bits or less.
53502 unsigned Size = VT.getSizeInBits();
53503 if (Size > 256 && Subtarget.useAVX512Regs())
53504 return SDValue();
53505
53506 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
53507 // those are the only integer compares we have.
53508 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53509 if (ISD::isUnsignedIntSetCC(CC))
53510 return SDValue();
53511
53512 // Only do this combine if the extension will be fully consumed by the setcc.
53513 EVT N00VT = N0.getOperand(0).getValueType();
53514 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
53515 if (Size != MatchingVecType.getSizeInBits())
53516 return SDValue();
53517
53518 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
53519
53520 if (N->getOpcode() == ISD::ZERO_EXTEND)
53521 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
53522
53523 return Res;
53524}
53525
53526static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
53527 TargetLowering::DAGCombinerInfo &DCI,
53528 const X86Subtarget &Subtarget) {
53529 SDValue N0 = N->getOperand(0);
53530 EVT VT = N->getValueType(0);
53531 SDLoc DL(N);
53532
53533 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53534 if (!DCI.isBeforeLegalizeOps() &&
53535 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53536 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53537 N0->getOperand(1));
53538 bool ReplaceOtherUses = !N0.hasOneUse();
53539 DCI.CombineTo(N, Setcc);
53540 // Replace other uses with a truncate of the widened setcc_carry.
53541 if (ReplaceOtherUses) {
53542 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53543 N0.getValueType(), Setcc);
53544 DCI.CombineTo(N0.getNode(), Trunc);
53545 }
53546
53547 return SDValue(N, 0);
53548 }
53549
53550 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53551 return NewCMov;
53552
53553 if (!DCI.isBeforeLegalizeOps())
53554 return SDValue();
53555
53556 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53557 return V;
53558
53559 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53560 DAG, DCI, Subtarget))
53561 return V;
53562
53563 if (VT.isVector()) {
53564 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
53565 return R;
53566
53567 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
53568 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
53569 }
53570
53571 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53572 return NewAdd;
53573
53574 return SDValue();
53575}
53576
53577static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
53578 TargetLowering::DAGCombinerInfo &DCI,
53579 const X86Subtarget &Subtarget) {
53580 SDLoc dl(N);
53581 EVT VT = N->getValueType(0);
53582 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
53583
53584 // Let legalize expand this if it isn't a legal type yet.
53585 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53586 if (!TLI.isTypeLegal(VT))
53587 return SDValue();
53588
53589 SDValue A = N->getOperand(IsStrict ? 1 : 0);
53590 SDValue B = N->getOperand(IsStrict ? 2 : 1);
53591 SDValue C = N->getOperand(IsStrict ? 3 : 2);
53592
53593 // If the operation allows fast-math and the target does not support FMA,
53594 // split this into mul+add to avoid libcall(s).
53595 SDNodeFlags Flags = N->getFlags();
53596 if (!IsStrict && Flags.hasAllowReassociation() &&
53597 TLI.isOperationExpand(ISD::FMA, VT)) {
53598 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53599 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53600 }
53601
53602 EVT ScalarVT = VT.getScalarType();
53603 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53604 !Subtarget.hasAnyFMA()) &&
53605 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53606 return SDValue();
53607
53608 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53609 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53610 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53611 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53612 CodeSize)) {
53613 V = NegV;
53614 return true;
53615 }
53616 // Look through extract_vector_elts. If it comes from an FNEG, create a
53617 // new extract from the FNEG input.
53618 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53619 isNullConstant(V.getOperand(1))) {
53620 SDValue Vec = V.getOperand(0);
53621 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53622 Vec, DAG, LegalOperations, CodeSize)) {
53623 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53624 NegV, V.getOperand(1));
53625 return true;
53626 }
53627 }
53628
53629 return false;
53630 };
53631
53632 // Do not convert the passthru input of scalar intrinsics.
53633 // FIXME: We could allow negations of the lower element only.
53634 bool NegA = invertIfNegative(A);
53635 bool NegB = invertIfNegative(B);
53636 bool NegC = invertIfNegative(C);
53637
53638 if (!NegA && !NegB && !NegC)
53639 return SDValue();
53640
53641 unsigned NewOpcode =
53642 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
53643
53644 // Propagate fast-math-flags to new FMA node.
53645 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53646 if (IsStrict) {
53647 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53648 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53649 {N->getOperand(0), A, B, C});
53650 } else {
53651 if (N->getNumOperands() == 4)
53652 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53653 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53654 }
53655}
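// Illustrative sketch of the rewrites performed above (assumed DAG shapes, not
// taken verbatim from this file):
//   (fma a, b, c)                ->  (fadd (fmul a, b), c)   ; reassoc allowed, no FMA unit
//   (fma (fneg a), b, c)         ->  (fnmadd a, b, c)
//   (fma a, b, (fneg c))         ->  (fmsub a, b, c)
//   (fma (fneg a), b, (fneg c))  ->  (fnmsub a, b, c)
// The exact target opcode comes from negateFMAOpcode, based on which operands
// were profitably negatable.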
53656
53657// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53658// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53659static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53660 TargetLowering::DAGCombinerInfo &DCI) {
53661 SDLoc dl(N);
53662 EVT VT = N->getValueType(0);
53663 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53664 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53665 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53666
53667 SDValue N2 = N->getOperand(2);
53668
53669 SDValue NegN2 =
53670 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53671 if (!NegN2)
53672 return SDValue();
53673 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53674
53675 if (N->getNumOperands() == 4)
53676 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53677 NegN2, N->getOperand(3));
53678 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53679 NegN2);
53680}
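// Reasoning sketch for the fold above: FMADDSUB produces a*b - c in even lanes
// and a*b + c in odd lanes, so substituting -c for c is the same as swapping
// the per-lane add/sub pattern, i.e. FMSUBADD on the un-negated c (and
// symmetrically for FMSUBADD -> FMADDSUB). The fold only fires when
// getCheaperNegatedExpression says the negated form of C is no more expensive.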
53681
53682static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53683 TargetLowering::DAGCombinerInfo &DCI,
53684 const X86Subtarget &Subtarget) {
53685 SDLoc dl(N);
53686 SDValue N0 = N->getOperand(0);
53687 EVT VT = N->getValueType(0);
53688
53689 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53690 // FIXME: Is this needed? We don't seem to have any tests for it.
53691 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53692 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53693 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53694 N0->getOperand(1));
53695 bool ReplaceOtherUses = !N0.hasOneUse();
53696 DCI.CombineTo(N, Setcc);
53697 // Replace other uses with a truncate of the widened setcc_carry.
53698 if (ReplaceOtherUses) {
53699 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53700 N0.getValueType(), Setcc);
53701 DCI.CombineTo(N0.getNode(), Trunc);
53702 }
53703
53704 return SDValue(N, 0);
53705 }
53706
53707 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53708 return NewCMov;
53709
53710 if (DCI.isBeforeLegalizeOps())
53711 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53712 return V;
53713
53714 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53715 DAG, DCI, Subtarget))
53716 return V;
53717
53718 if (VT.isVector())
53719 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
53720 return R;
53721
53722 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53723 return NewAdd;
53724
53725 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53726 return R;
53727
53728 // TODO: Combine with any target/faux shuffle.
53729 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53730 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
53731 SDValue N00 = N0.getOperand(0);
53732 SDValue N01 = N0.getOperand(1);
53733 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53734 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53735 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53736 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53737 return concatSubVectors(N00, N01, DAG, dl);
53738 }
53739 }
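  // Sketch of the PACKUS fold above (illustrative types): when both pack
  // inputs already have their upper halves known zero, the pack is a plain
  // truncation, so zero-extending it back just re-concatenates the inputs:
  //   (v16i16 zext (v16i8 packus (v8i16 a), (v8i16 b)))
  //     -> (v16i16 concat a, b)        ; a, b known to fit in 8 bits per lane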
53740
53741 return SDValue();
53742}
53743
53744/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
53745/// recognizable memcmp expansion.
53746static bool isOrXorXorTree(SDValue X, bool Root = true) {
53747 if (X.getOpcode() == ISD::OR)
53748 return isOrXorXorTree(X.getOperand(0), false) &&
53749 isOrXorXorTree(X.getOperand(1), false);
53750 if (Root)
53751 return false;
53752 return X.getOpcode() == ISD::XOR;
53753}
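// Shape accepted above (illustrative): the memcmp expansion of, e.g., a 32-byte
// equality compare against zero typically looks like
//   (setcc (or (xor A, B), (xor C, D)), 0, eq)
// i.e. an OR tree whose leaves are XORs. The root itself must be an OR; a bare
// XOR at the root is left to the ordinary setcc handling.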
53754
53755/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
53756/// expansion.
53757template <typename F>
53758static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
53759 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
53760 SDValue Op0 = X.getOperand(0);
53761 SDValue Op1 = X.getOperand(1);
53762 if (X.getOpcode() == ISD::OR) {
53763 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
53764 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
53765 if (VecVT != CmpVT)
53766 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
53767 if (HasPT)
53768 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
53769 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
53770 }
53771 if (X.getOpcode() == ISD::XOR) {
53772 SDValue A = SToV(Op0);
53773 SDValue B = SToV(Op1);
53774 if (VecVT != CmpVT)
53775 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
53776 if (HasPT)
53777 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
53778 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
53779 }
53780 llvm_unreachable("Impossible");
53781}
53782
53783/// Try to map a 128-bit or larger integer comparison to vector instructions
53784/// before type legalization splits it up into chunks.
53785static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
53786 const X86Subtarget &Subtarget) {
53787 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
53788 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
53789
53790 // We're looking for an oversized integer equality comparison.
53791 SDValue X = SetCC->getOperand(0);
53792 SDValue Y = SetCC->getOperand(1);
53793 EVT OpVT = X.getValueType();
53794 unsigned OpSize = OpVT.getSizeInBits();
53795 if (!OpVT.isScalarInteger() || OpSize < 128)
53796 return SDValue();
53797
53798 // Ignore a comparison with zero because that gets special treatment in
53799 // EmitTest(). But make an exception for the special case of a pair of
53800 // logically-combined vector-sized operands compared to zero. This pattern may
53801 // be generated by the memcmp expansion pass with oversized integer compares
53802 // (see PR33325).
53803 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
53804 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
53805 return SDValue();
53806
53807 // Don't perform this combine if constructing the vector will be expensive.
53808 auto IsVectorBitCastCheap = [](SDValue X) {
53809 X = peekThroughBitcasts(X);
53810 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
53811 X.getOpcode() == ISD::LOAD;
53812 };
53813 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
53814 !IsOrXorXorTreeCCZero)
53815 return SDValue();
53816
53817 EVT VT = SetCC->getValueType(0);
53818 SDLoc DL(SetCC);
53819
53820 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
53821 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
53822 // Otherwise use PCMPEQ (plus AND) and mask testing.
53823 bool NoImplicitFloatOps =
53824 DAG.getMachineFunction().getFunction().hasFnAttribute(
53825 Attribute::NoImplicitFloat);
53826 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
53827 ((OpSize == 128 && Subtarget.hasSSE2()) ||
53828 (OpSize == 256 && Subtarget.hasAVX()) ||
53829 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
53830 bool HasPT = Subtarget.hasSSE41();
53831
53832 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
53833 // vector registers are essentially free. (Technically, widening registers
53834 // prevents load folding, but the tradeoff is worth it.)
53835 bool PreferKOT = Subtarget.preferMaskRegisters();
53836 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
53837
53838 EVT VecVT = MVT::v16i8;
53839 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
53840 if (OpSize == 256) {
53841 VecVT = MVT::v32i8;
53842 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
53843 }
53844 EVT CastVT = VecVT;
53845 bool NeedsAVX512FCast = false;
53846 if (OpSize == 512 || NeedZExt) {
53847 if (Subtarget.hasBWI()) {
53848 VecVT = MVT::v64i8;
53849 CmpVT = MVT::v64i1;
53850 if (OpSize == 512)
53851 CastVT = VecVT;
53852 } else {
53853 VecVT = MVT::v16i32;
53854 CmpVT = MVT::v16i1;
53855 CastVT = OpSize == 512 ? VecVT :
53856 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
53857 NeedsAVX512FCast = true;
53858 }
53859 }
53860
53861 auto ScalarToVector = [&](SDValue X) -> SDValue {
53862 bool TmpZext = false;
53863 EVT TmpCastVT = CastVT;
53864 if (X.getOpcode() == ISD::ZERO_EXTEND) {
53865 SDValue OrigX = X.getOperand(0);
53866 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
53867 if (OrigSize < OpSize) {
53868 if (OrigSize == 128) {
53869 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
53870 X = OrigX;
53871 TmpZext = true;
53872 } else if (OrigSize == 256) {
53873 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
53874 X = OrigX;
53875 TmpZext = true;
53876 }
53877 }
53878 }
53879 X = DAG.getBitcast(TmpCastVT, X);
53880 if (!NeedZExt && !TmpZext)
53881 return X;
53882 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
53883 DAG.getConstant(0, DL, VecVT), X,
53884 DAG.getVectorIdxConstant(0, DL));
53885 };
53886
53887 SDValue Cmp;
53888 if (IsOrXorXorTreeCCZero) {
53889 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
53890 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
53891 // Use 2 vector equality compares and 'and' the results before doing a
53892 // MOVMSK.
53893 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
53894 } else {
53895 SDValue VecX = ScalarToVector(X);
53896 SDValue VecY = ScalarToVector(Y);
53897 if (VecVT != CmpVT) {
53898 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
53899 } else if (HasPT) {
53900 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
53901 } else {
53902 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
53903 }
53904 }
53905 // AVX512 should emit a setcc that will lower to kortest.
53906 if (VecVT != CmpVT) {
53907 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
53908 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
53909 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
53910 DAG.getConstant(0, DL, KRegVT), CC);
53911 }
53912 if (HasPT) {
53913 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
53914 Cmp);
53915 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
53916 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
53917 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
53918 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
53919 }
53920 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
53921 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
53922 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
53923 assert(Cmp.getValueType() == MVT::v16i8 &&
53924        "Non 128-bit vector on pre-SSE41 target");
53925 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
53926 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
53927 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
53928 }
53929
53930 return SDValue();
53931}
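// Hedged example of the input this targets (not from this file): the memcmp
// expansion pass turns
//   bool eq16(const char *a, const char *b) { return memcmp(a, b, 16) == 0; }
// into an i128 equality compare of two loads. The code above maps that to,
// roughly, PXOR + PTEST on SSE4.1+, PCMPEQB + PMOVMSKB compared against 0xFFFF
// before SSE4.1, or a mask compare feeding KORTEST when AVX-512 mask registers
// are preferred.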
53932
53933/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53934/// pre-promote its result type since vXi1 vectors don't get promoted
53935/// during type legalization.
53936static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
53937 SDValue RHS, ISD::CondCode CC, SDLoc DL,
53938 SelectionDAG &DAG,
53939 const X86Subtarget &Subtarget) {
53940 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53941 VT.getVectorElementType() == MVT::i1 &&
53942 (OpVT.getVectorElementType() == MVT::i8 ||
53943 OpVT.getVectorElementType() == MVT::i16)) {
53944 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53945 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53946 }
53947 return SDValue();
53948}
53949
53950static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53951 TargetLowering::DAGCombinerInfo &DCI,
53952 const X86Subtarget &Subtarget) {
53953 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53954 const SDValue LHS = N->getOperand(0);
53955 const SDValue RHS = N->getOperand(1);
53956 EVT VT = N->getValueType(0);
53957 EVT OpVT = LHS.getValueType();
53958 SDLoc DL(N);
53959
53960 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53961 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
53962 return V;
53963
53964 if (VT == MVT::i1 && isNullConstant(RHS)) {
53965 SDValue X86CC;
53966 if (SDValue V =
53967 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
53968 return DAG.getNode(ISD::TRUNCATE, DL, VT,
53969 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
53970 }
53971
53972 if (OpVT.isScalarInteger()) {
53973 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
53974 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
53975 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
53976 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
53977 if (N0.getOperand(0) == N1)
53978 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53979 N0.getOperand(1));
53980 if (N0.getOperand(1) == N1)
53981 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53982 N0.getOperand(0));
53983 }
53984 return SDValue();
53985 };
53986 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
53987 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53988 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
53989 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53990
53991 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
53992 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
53993 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
53994 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
53995 if (N0.getOperand(0) == N1)
53996 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53997 DAG.getNOT(DL, N0.getOperand(1), OpVT));
53998 if (N0.getOperand(1) == N1)
53999 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54000 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54001 }
54002 return SDValue();
54003 };
54004 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54005 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54006 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54007 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
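      // Scalar identities behind the two folds above (illustrative):
      //   (x | y) == x  <=>  (~x & y) == 0    ; y contributes no bits outside x
      //   (x & y) == y  <=>  (y & ~x) == 0    ; x keeps every bit of y
      // Both rewritten forms compare an AND against zero, which maps onto TEST.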
54008
54009 // cmpeq(trunc(x),0) --> cmpeq(x,0)
54010 // cmpne(trunc(x),0) --> cmpne(x,0)
54011 // iff x upper bits are zero.
54012 // TODO: Add support for RHS to be truncate as well?
54013 if (LHS.getOpcode() == ISD::TRUNCATE &&
54014 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54015 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
54016 EVT SrcVT = LHS.getOperand(0).getValueType();
54017 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54018 OpVT.getScalarSizeInBits());
54019 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54020 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54021 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54022 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54023 DAG.getConstant(0, DL, SrcVT), CC);
54024 }
54025
54026 // With C as a power of 2 and C != 0 and C != INT_MIN:
54027 // icmp eq Abs(X) C ->
54028 // (icmp eq A, C) | (icmp eq A, -C)
54029 // icmp ne Abs(X) C ->
54030 // (icmp ne A, C) & (icmp ne A, -C)
54031 // Both of these patterns can be better optimized in
54032 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54033 // integers which is checked above.
54034 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54035 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54036 const APInt &CInt = C->getAPIntValue();
54037 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54038 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54039 SDValue BaseOp = LHS.getOperand(0);
54040 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54041 SDValue SETCC1 = DAG.getSetCC(
54042 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54043 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54044 SETCC0, SETCC1);
54045 }
54046 }
54047 }
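      // Concrete instance of the abs fold above (illustrative, C = 8):
      //   abs(x) == 8  ->  (x == 8) | (x == -8)
      //   abs(x) != 8  ->  (x != 8) & (x != -8)
      // The power-of-2 / non-INT_MIN restriction keeps -C well defined and, per
      // the note above, lets DAGCombiner::foldAndOrOfSETCC clean up the pair.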
54048 }
54049 }
54050
54051 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54052 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54053 // Using temporaries to avoid messing up operand ordering for later
54054 // transformations if this doesn't work.
54055 SDValue Op0 = LHS;
54056 SDValue Op1 = RHS;
54057 ISD::CondCode TmpCC = CC;
54058 // Put build_vector on the right.
54059 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54060 std::swap(Op0, Op1);
54061 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54062 }
54063
54064 bool IsSEXT0 =
54065 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54066 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54067 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54068
54069 if (IsSEXT0 && IsVZero1) {
54070 assert(VT == Op0.getOperand(0).getValueType() &&
54071        "Unexpected operand type");
54072 if (TmpCC == ISD::SETGT)
54073 return DAG.getConstant(0, DL, VT);
54074 if (TmpCC == ISD::SETLE)
54075 return DAG.getConstant(1, DL, VT);
54076 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54077 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54078
54079 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54080        "Unexpected condition code!");
54081 return Op0.getOperand(0);
54082 }
54083 }
54084
54085 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets only
54086 // signed comparisons (`PCMPGT`) are available, and on AVX512 it is often better
54087 // to use `PCMPGT` if the result is meant to stay in a vector (if it is going to
54088 // a mask, there are signed AVX512 comparisons).
54089 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54090 bool CanMakeSigned = false;
54091 if (ISD::isUnsignedIntSetCC(CC)) {
54092 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54093 DAG.computeKnownBits(RHS));
54094 // If we know LHS/RHS share the same sign bit at each element we can
54095 // make this signed.
54096 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54097 // across all lanes. So a pattern where the sign varies from lane to
54098 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54099 // missed. We could get around this by demanding each lane
54100 // independently, but this isn't the most important optimization and
54101 // that may eat into compile time.
54102 CanMakeSigned =
54103 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54104 }
54105 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54106 SDValue LHSOut = LHS;
54107 SDValue RHSOut = RHS;
54108 ISD::CondCode NewCC = CC;
54109 switch (CC) {
54110 case ISD::SETGE:
54111 case ISD::SETUGE:
54112 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54113 /*NSW*/ true))
54114 LHSOut = NewLHS;
54115 else if (SDValue NewRHS = incDecVectorConstant(
54116 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54117 RHSOut = NewRHS;
54118 else
54119 break;
54120
54121 [[fallthrough]];
54122 case ISD::SETUGT:
54123 NewCC = ISD::SETGT;
54124 break;
54125
54126 case ISD::SETLE:
54127 case ISD::SETULE:
54128 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54129 /*NSW*/ true))
54130 LHSOut = NewLHS;
54131 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54132 /*NSW*/ true))
54133 RHSOut = NewRHS;
54134 else
54135 break;
54136
54137 [[fallthrough]];
54138 case ISD::SETULT:
54139 // Will be swapped to SETGT in LowerVSETCC*.
54140 NewCC = ISD::SETLT;
54141 break;
54142 default:
54143 break;
54144 }
54145 if (NewCC != CC) {
54146 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54147 NewCC, DL, DAG, Subtarget))
54148 return R;
54149 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54150 }
54151 }
54152 }
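  // Illustrative instances of the adjustment above, valid once the sign bits of
  // both sides are known to agree (or the predicate was already signed):
  //   setugt X, Y  ->  setgt X, Y
  //   setuge X, C  ->  setgt X, (C - 1)   ; constant adjusted without signed wrap
  // so the comparison can be lowered with the signed PCMPGT hardware.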
54153
54154 if (SDValue R =
54155 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54156 return R;
54157
54158 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54159 // to avoid scalarization via legalization because v4i32 is not a legal type.
54160 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54161 LHS.getValueType() == MVT::v4f32)
54162 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54163
54164 // X pred 0.0 --> X pred -X
54165 // If the negation of X already exists, use it in the comparison. This removes
54166 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54167 // instructions in patterns with a 'select' node.
54168 if (isNullFPScalarOrVectorConst(RHS)) {
54169 SDVTList FNegVT = DAG.getVTList(OpVT);
54170 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54171 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54172 }
54173
54174 return SDValue();
54175}
54176
54177static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54178 TargetLowering::DAGCombinerInfo &DCI,
54179 const X86Subtarget &Subtarget) {
54180 SDValue Src = N->getOperand(0);
54181 MVT SrcVT = Src.getSimpleValueType();
54182 MVT VT = N->getSimpleValueType(0);
54183 unsigned NumBits = VT.getScalarSizeInBits();
54184 unsigned NumElts = SrcVT.getVectorNumElements();
54185 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54186 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54187
54188 // Perform constant folding.
54189 APInt UndefElts;
54190 SmallVector<APInt, 32> EltBits;
54191 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54192 APInt Imm(32, 0);
54193 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54194 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54195 Imm.setBit(Idx);
54196
54197 return DAG.getConstant(Imm, SDLoc(N), VT);
54198 }
54199
54200 // Look through int->fp bitcasts that don't change the element width.
54201 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54202 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54203 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54204 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54205
54206 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54207 // with scalar comparisons.
54208 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54209 SDLoc DL(N);
54210 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54211 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54212 return DAG.getNode(ISD::XOR, DL, VT,
54213 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54214 DAG.getConstant(NotMask, DL, VT));
54215 }
54216
54217 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54218 // results with scalar comparisons.
54219 if (Src.getOpcode() == X86ISD::PCMPGT &&
54220 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54221 SDLoc DL(N);
54222 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54223 return DAG.getNode(ISD::XOR, DL, VT,
54224 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54225 DAG.getConstant(NotMask, DL, VT));
54226 }
54227
54228 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54229 // iff pow2splat(c1).
54230 if (Src.getOpcode() == X86ISD::PCMPEQ &&
54231 Src.getOperand(0).getOpcode() == ISD::AND &&
54232 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
54233 SDValue LHS = Src.getOperand(0).getOperand(0);
54234 SDValue RHS = Src.getOperand(0).getOperand(1);
54235 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
54236 if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
54237 SDLoc DL(N);
54238 MVT ShiftVT = SrcVT;
54239 if (ShiftVT.getScalarType() == MVT::i8) {
54240 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54241 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54242 LHS = DAG.getBitcast(ShiftVT, LHS);
54243 }
54244 unsigned ShiftAmt = KnownRHS.getConstant().countl_zero();
54245 LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
54246 ShiftAmt, DAG);
54247 LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
54248 return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
54249 }
54250 }
54251
54252 // Simplify the inputs.
54253 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54254 APInt DemandedMask(APInt::getAllOnes(NumBits));
54255 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54256 return SDValue(N, 0);
54257
54258 return SDValue();
54259}
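// Hedged examples of the MOVMSK folds above (assumed element counts):
//   movmsk(not X)                 -> movmsk(X) ^ ((1 << NumElts) - 1)
//   movmsk(pcmpgt X, all-ones)    -> movmsk(X) ^ ((1 << NumElts) - 1)
//       ; "X > -1" is just "sign bit clear", i.e. the inverted sign mask
//   movmsk(pcmpeq (and X, splat(1 << k)), 0)
//                                 -> movmsk(not (X << (EltBits - 1 - k)))
//       ; shift the tested bit into the sign position, then invert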
54260
54261static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54262 TargetLowering::DAGCombinerInfo &DCI,
54263 const X86Subtarget &Subtarget) {
54264 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54265 SDValue BasePtr = MemOp->getBasePtr();
54266 SDValue Index = MemOp->getIndex();
54267 SDValue Scale = MemOp->getScale();
54268 SDValue Mask = MemOp->getMask();
54269
54270 // Attempt to fold an index scale into the scale value directly.
54271 // For smaller indices, implicit sext is performed BEFORE scale, preventing
54272 // this fold under most circumstances.
54273 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
54274 if ((Index.getOpcode() == X86ISD::VSHLI ||
54275 (Index.getOpcode() == ISD::ADD &&
54276 Index.getOperand(0) == Index.getOperand(1))) &&
54277 isa<ConstantSDNode>(Scale) &&
54278 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
54279 unsigned ShiftAmt =
54280 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
54281 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54282 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
54283 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
54284 SDValue NewIndex = Index.getOperand(0);
54285 SDValue NewScale =
54286 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
54287 if (N->getOpcode() == X86ISD::MGATHER)
54288 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
54289 MemOp->getOperand(1), Mask,
54290 MemOp->getBasePtr(), NewIndex, NewScale,
54291 MemOp->getChain(), Subtarget);
54292 if (N->getOpcode() == X86ISD::MSCATTER)
54293 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
54294 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
54295 NewIndex, NewScale, MemOp->getChain(), Subtarget);
54296 }
54297 }
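  // Illustrative instance of the scale fold above: a constant left shift (or a
  // self-add) of the index can be absorbed into the addressing scale as long as
  // the product stays a power of two no larger than 8, e.g.
  //   gather(base, X << 1, scale = 4)  ->  gather(base, X, scale = 8)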
54298
54299 // With vector masks we only demand the upper bit of the mask.
54300 if (Mask.getScalarValueSizeInBits() != 1) {
54301 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54302 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54303 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54304 if (N->getOpcode() != ISD::DELETED_NODE)
54305 DCI.AddToWorklist(N);
54306 return SDValue(N, 0);
54307 }
54308 }
54309
54310 return SDValue();
54311}
54312
54313static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
54314 SDValue Index, SDValue Base, SDValue Scale,
54315 SelectionDAG &DAG) {
54316 SDLoc DL(GorS);
54317
54318 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54319 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54320 Gather->getMask(), Base, Index, Scale } ;
54321 return DAG.getMaskedGather(Gather->getVTList(),
54322 Gather->getMemoryVT(), DL, Ops,
54323 Gather->getMemOperand(),
54324 Gather->getIndexType(),
54325 Gather->getExtensionType());
54326 }
54327 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54328 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54329 Scatter->getMask(), Base, Index, Scale };
54330 return DAG.getMaskedScatter(Scatter->getVTList(),
54331 Scatter->getMemoryVT(), DL,
54332 Ops, Scatter->getMemOperand(),
54333 Scatter->getIndexType(),
54334 Scatter->isTruncatingStore());
54335}
54336
54337static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
54338 TargetLowering::DAGCombinerInfo &DCI) {
54339 SDLoc DL(N);
54340 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54341 SDValue Index = GorS->getIndex();
54342 SDValue Base = GorS->getBasePtr();
54343 SDValue Scale = GorS->getScale();
54344
54345 if (DCI.isBeforeLegalize()) {
54346 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54347
54348 // Shrink constant indices if they are larger than 32-bits.
54349 // Only do this before legalize types since v2i64 could become v2i32.
54350 // FIXME: We could check that the type is legal if we're after legalize
54351 // types, but then we would need to construct test cases where that happens.
54352 // FIXME: We could support more than just constant vectors, but we need to be
54353 // careful with costing. A truncate that can be optimized out would be fine.
54354 // Otherwise we might only want to create a truncate if it avoids a split.
54355 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54356 if (BV->isConstant() && IndexWidth > 32 &&
54357 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54358 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54359 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54360 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54361 }
54362 }
54363
54364 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
54365 // there are sufficient sign bits. Only do this before legalize types to
54366 // avoid creating illegal types in truncate.
54367 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54368 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54369 IndexWidth > 32 &&
54370 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54371 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54372 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54373 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54374 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54375 }
54376 }
54377
54378 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54379 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54380 // Try to move splat constant adders from the index operand to the base
54381 // pointer operand. Taking care to multiply by the scale. We can only do
54382 // this when index element type is the same as the pointer type.
54383 // Otherwise we need to be sure the math doesn't wrap before the scale.
54384 if (Index.getOpcode() == ISD::ADD &&
54385 Index.getValueType().getVectorElementType() == PtrVT &&
54386 isa<ConstantSDNode>(Scale)) {
54387 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54388 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54389 BitVector UndefElts;
54390 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54391 // FIXME: Allow non-constant?
54392 if (UndefElts.none()) {
54393 // Apply the scale.
54394 APInt Adder = C->getAPIntValue() * ScaleAmt;
54395 // Add it to the existing base.
54396 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54397 DAG.getConstant(Adder, DL, PtrVT));
54398 Index = Index.getOperand(0);
54399 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54400 }
54401 }
54402
54403 // It's also possible base is just a constant. In that case, just
54404 // replace it with 0 and move the displacement into the index.
54405 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54406 isOneConstant(Scale)) {
54407 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54408 // Combine the constant build_vector and the constant base.
54409 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54410 Index.getOperand(1), Splat);
54411 // Add to the LHS of the original Index add.
54412 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54413 Index.getOperand(0), Splat);
54414 Base = DAG.getConstant(0, DL, Base.getValueType());
54415 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54416 }
54417 }
54418 }
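    // Sketch of the two rewrites above (illustrative operands): with the index
    // element type equal to the pointer type,
    //   gather(Base, X + splat(C), Scale)  ->  gather(Base + C * Scale, X, Scale)
    // and, when Base itself is constant and Scale == 1, the displacement moves
    // the other way:
    //   gather(C0, X + CV, 1)              ->  gather(0, X + (CV + splat(C0)), 1)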
54419
54420 if (DCI.isBeforeLegalizeOps()) {
54421 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54422
54423 // Make sure the index is either i32 or i64
54424 if (IndexWidth != 32 && IndexWidth != 64) {
54425 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54426 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54427 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54428 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54429 }
54430 }
54431
54432 // With vector masks we only demand the upper bit of the mask.
54433 SDValue Mask = GorS->getMask();
54434 if (Mask.getScalarValueSizeInBits() != 1) {
54435 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54436 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54437 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54438 if (N->getOpcode() != ISD::DELETED_NODE)
54439 DCI.AddToWorklist(N);
54440 return SDValue(N, 0);
54441 }
54442 }
54443
54444 return SDValue();
54445}
54446
54447// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54448static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54449 const X86Subtarget &Subtarget) {
54450 SDLoc DL(N);
54451 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54452 SDValue EFLAGS = N->getOperand(1);
54453
54454 // Try to simplify the EFLAGS and condition code operands.
54455 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54456 return getSETCC(CC, Flags, DL, DAG);
54457
54458 return SDValue();
54459}
54460
54461/// Optimize branch condition evaluation.
54462static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54463 const X86Subtarget &Subtarget) {
54464 SDLoc DL(N);
54465 SDValue EFLAGS = N->getOperand(3);
54466 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54467
54468 // Try to simplify the EFLAGS and condition code operands.
54469 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54470 // RAUW them under us.
54471 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54472 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54473 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54474 N->getOperand(1), Cond, Flags);
54475 }
54476
54477 return SDValue();
54478}
54479
54480// TODO: Could we move this to DAGCombine?
54481static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54482 SelectionDAG &DAG) {
54483 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54484 // to optimize away operation when it's from a constant.
54485 //
54486 // The general transformation is:
54487 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54488 // AND(VECTOR_CMP(x,y), constant2)
54489 // constant2 = UNARYOP(constant)
54490
54491 // Early exit if this isn't a vector operation, the operand of the
54492 // unary operation isn't a bitwise AND, or if the sizes of the operations
54493 // aren't the same.
54494 EVT VT = N->getValueType(0);
54495 bool IsStrict = N->isStrictFPOpcode();
54496 unsigned NumEltBits = VT.getScalarSizeInBits();
54497 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54498 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54499 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54500 VT.getSizeInBits() != Op0.getValueSizeInBits())
54501 return SDValue();
54502
54503 // Now check that the other operand of the AND is a constant. We could
54504 // make the transformation for non-constant splats as well, but it's unclear
54505 // that would be a benefit as it would not eliminate any operations, just
54506 // perform one more step in scalar code before moving to the vector unit.
54507 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54508 // Bail out if the vector isn't a constant.
54509 if (!BV->isConstant())
54510 return SDValue();
54511
54512 // Everything checks out. Build up the new and improved node.
54513 SDLoc DL(N);
54514 EVT IntVT = BV->getValueType(0);
54515 // Create a new constant of the appropriate type for the transformed
54516 // DAG.
54517 SDValue SourceConst;
54518 if (IsStrict)
54519 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54520 {N->getOperand(0), SDValue(BV, 0)});
54521 else
54522 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54523 // The AND node needs bitcasts to/from an integer vector type around it.
54524 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54525 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54526 MaskConst);
54527 SDValue Res = DAG.getBitcast(VT, NewAnd);
54528 if (IsStrict)
54529 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54530 return Res;
54531 }
54532
54533 return SDValue();
54534}
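// Worked example of the transform above (illustrative constants): each lane of
// a vector compare is all-zeros or all-ones, so ANDing with a constant yields
// either 0 or the constant per lane, and the unary op can be applied to the
// constant at compile time:
//   sint_to_fp(and(vsetcc(x, y), <i32 1, 2, 3, 4>))
//     -> and(vsetcc(x, y), bitcast(<float 1.0, 2.0, 3.0, 4.0>))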
54535
54536/// If we are converting a value to floating-point, try to replace scalar
54537/// truncate of an extracted vector element with a bitcast. This tries to keep
54538/// the sequence on XMM registers rather than moving between vector and GPRs.
54539static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54540 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54541 // to allow being called by any similar cast opcode.
54542 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54543 SDValue Trunc = N->getOperand(0);
54544 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54545 return SDValue();
54546
54547 SDValue ExtElt = Trunc.getOperand(0);
54548 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54549 !isNullConstant(ExtElt.getOperand(1)))
54550 return SDValue();
54551
54552 EVT TruncVT = Trunc.getValueType();
54553 EVT SrcVT = ExtElt.getValueType();
54554 unsigned DestWidth = TruncVT.getSizeInBits();
54555 unsigned SrcWidth = SrcVT.getSizeInBits();
54556 if (SrcWidth % DestWidth != 0)
54557 return SDValue();
54558
54559 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54560 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54561 unsigned VecWidth = SrcVecVT.getSizeInBits();
54562 unsigned NumElts = VecWidth / DestWidth;
54563 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54564 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54565 SDLoc DL(N);
54566 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
54567 BitcastVec, ExtElt.getOperand(1));
54568 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54569}
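// Hedged example of the rewrite above (little-endian layout assumed): instead
// of moving the element to a GPR, truncating, and converting from there,
//   sint_to_fp(trunc i32 (extractelement <2 x i64> X, 0))
//     -> sint_to_fp(extractelement (bitcast X to <4 x i32>), 0)
// keeps the whole sequence on XMM registers; element 0 of the i32 view is the
// low half of element 0 of the i64 view.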
54570
54571static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
54572 const X86Subtarget &Subtarget) {
54573 bool IsStrict = N->isStrictFPOpcode();
54574 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54575 EVT VT = N->getValueType(0);
54576 EVT InVT = Op0.getValueType();
54577
54578 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54579 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54580 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
54581 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54582 unsigned ScalarSize = InVT.getScalarSizeInBits();
54583 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
54584 return SDValue();
54585 SDLoc dl(N);
54586 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
54587 ScalarSize < 16 ? MVT::i16
54588 : ScalarSize < 32 ? MVT::i32
54589 : MVT::i64,
54590 InVT.getVectorNumElements());
54591 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54592 if (IsStrict)
54593 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
54594 {N->getOperand(0), P});
54595 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
54596 }
54597
54598 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54599 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54600 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54601 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54602 VT.getScalarType() != MVT::f16) {
54603 SDLoc dl(N);
54604 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54605 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54606
54607 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
54608 if (IsStrict)
54609 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54610 {N->getOperand(0), P});
54611 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54612 }
54613
54614 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
54615 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
54616 // the optimization here.
54617 if (DAG.SignBitIsZero(Op0)) {
54618 if (IsStrict)
54619 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
54620 {N->getOperand(0), Op0});
54621 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
54622 }
54623
54624 return SDValue();
54625}
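// Illustrative instances of the rewrites above (assumed example types):
//   uint_to_fp(<8 x i8> X)  ->  sint_to_fp(zext X to <8 x i32>)
//       ; the zero-extend clears the sign bit, so the cheaper signed
//       ; conversion is safe even without AVX-512's native UINT_TO_FP
//   uint_to_fp(i32 X)       ->  sint_to_fp(i32 X)   ; when the sign bit is known 0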
54626
54627static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
54628 TargetLowering::DAGCombinerInfo &DCI,
54629 const X86Subtarget &Subtarget) {
54630 // First try to optimize away the conversion entirely when it's
54631 // conditionally from a constant. Vectors only.
54632 bool IsStrict = N->isStrictFPOpcode();
54633 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
54634 return Res;
54635
54636 // Now move on to more general possibilities.
54637 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54638 EVT VT = N->getValueType(0);
54639 EVT InVT = Op0.getValueType();
54640
54641 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54642 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54643 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
54644 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54645 unsigned ScalarSize = InVT.getScalarSizeInBits();
54646 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
54647 return SDValue();
54648 SDLoc dl(N);
54649 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
54650 ScalarSize < 16 ? MVT::i16
54651 : ScalarSize < 32 ? MVT::i32
54652 : MVT::i64,
54653 InVT.getVectorNumElements());
54654 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54655 if (IsStrict)
54656 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54657 {N->getOperand(0), P});
54658 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54659 }
54660
54661 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
54662 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
54663 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
54664 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54665 VT.getScalarType() != MVT::f16) {
54666 SDLoc dl(N);
54667 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54668 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54669 if (IsStrict)
54670 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54671 {N->getOperand(0), P});
54672 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54673 }
54674
54675 // Without AVX512DQ we only support i64 to float scalar conversion. For both
54676 // vectors and scalars, see if we know that the upper bits are all the sign
54677 // bit, in which case we can truncate the input to i32 and convert from that.
54678 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
54679 unsigned BitWidth = InVT.getScalarSizeInBits();
54680 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
54681 if (NumSignBits >= (BitWidth - 31)) {
54682 EVT TruncVT = MVT::i32;
54683 if (InVT.isVector())
54684 TruncVT = InVT.changeVectorElementType(TruncVT);
54685 SDLoc dl(N);
54686 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
54687 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
54688 if (IsStrict)
54689 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54690 {N->getOperand(0), Trunc});
54691 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
54692 }
54693 // If we're after legalize and the type is v2i32 we need to shuffle and
54694 // use CVTSI2P.
54695 assert(InVT == MVT::v2i64 && "Unexpected VT!");
54696 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
54697 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54698 { 0, 2, -1, -1 });
54699 if (IsStrict)
54700 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54701 {N->getOperand(0), Shuf});
54702 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54703 }
54704 }
54705
54706 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54707 // a 32-bit target where SSE doesn't support i64->FP operations.
54708 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54709 Op0.getOpcode() == ISD::LOAD) {
54710 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54711
54712 // This transformation is not supported if the result type is f16 or f128.
54713 if (VT == MVT::f16 || VT == MVT::f128)
54714 return SDValue();
54715
54716 // If we have AVX512DQ we can use packed conversion instructions unless
54717 // the VT is f80.
54718 if (Subtarget.hasDQI() && VT != MVT::f80)
54719 return SDValue();
54720
54721 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54722 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54723 std::pair<SDValue, SDValue> Tmp =
54724 Subtarget.getTargetLowering()->BuildFILD(
54725 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54726 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54727 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54728 return Tmp.first;
54729 }
54730 }
54731
54732 if (IsStrict)
54733 return SDValue();
54734
54735 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54736 return V;
54737
54738 return SDValue();
54739}
54740
54741static bool needCarryOrOverflowFlag(SDValue Flags) {
54742 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54743
54744 for (const SDNode *User : Flags->uses()) {
54745 X86::CondCode CC;
54746 switch (User->getOpcode()) {
54747 default:
54748 // Be conservative.
54749 return true;
54750 case X86ISD::SETCC:
54751 case X86ISD::SETCC_CARRY:
54752 CC = (X86::CondCode)User->getConstantOperandVal(0);
54753 break;
54754 case X86ISD::BRCOND:
54755 case X86ISD::CMOV:
54756 CC = (X86::CondCode)User->getConstantOperandVal(2);
54757 break;
54758 }
54759
54760 switch (CC) {
54761 default: break;
54762 case X86::COND_A: case X86::COND_AE:
54763 case X86::COND_B: case X86::COND_BE:
54764 case X86::COND_O: case X86::COND_NO:
54765 case X86::COND_G: case X86::COND_GE:
54766 case X86::COND_L: case X86::COND_LE:
54767 return true;
54768 }
54769 }
54770
54771 return false;
54772}
54773
54774static bool onlyZeroFlagUsed(SDValue Flags) {
54775 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54776
54777 for (const SDNode *User : Flags->uses()) {
54778 unsigned CCOpNo;
54779 switch (User->getOpcode()) {
54780 default:
54781 // Be conservative.
54782 return false;
54783 case X86ISD::SETCC:
54784 case X86ISD::SETCC_CARRY:
54785 CCOpNo = 0;
54786 break;
54787 case X86ISD::BRCOND:
54788 case X86ISD::CMOV:
54789 CCOpNo = 2;
54790 break;
54791 }
54792
54793 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54794 if (CC != X86::COND_E && CC != X86::COND_NE)
54795 return false;
54796 }
54797
54798 return true;
54799}
54800
54801static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
54802 // Only handle test patterns.
54803 if (!isNullConstant(N->getOperand(1)))
54804 return SDValue();
54805
54806 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54807 // and use its flags directly.
54808 // TODO: Maybe we should try promoting compares that only use the zero flag
54809 // first if we can prove the upper bits with computeKnownBits?
54810 SDLoc dl(N);
54811 SDValue Op = N->getOperand(0);
54812 EVT VT = Op.getValueType();
54813
54814 // If we have a constant logical shift that's only used in a comparison
54815 // against zero turn it into an equivalent AND. This allows turning it into
54816 // a TEST instruction later.
54817 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54818 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54819 onlyZeroFlagUsed(SDValue(N, 0))) {
54820 unsigned BitWidth = VT.getSizeInBits();
54821 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54822 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54823 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54824 APInt Mask = Op.getOpcode() == ISD::SRL
54825 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54826 : APInt::getLowBitsSet(BitWidth, MaskBits);
54827 if (Mask.isSignedIntN(32)) {
54828 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54829 DAG.getConstant(Mask, dl, VT));
54830 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54831 DAG.getConstant(0, dl, VT));
54832 }
54833 }
54834 }
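  // Concrete instance of the rewrite above (illustrative, 32-bit operand):
  //   cmp (srl X, 5), 0  ->  test X, 0xFFFFFFE0
  //   cmp (shl X, 5), 0  ->  test X, 0x07FFFFFF
  // Only the bits that survive the shift matter for the zero flag, and the
  // resulting mask must still fit a signed 32-bit immediate.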
54835
54836 // Peek through any zero-extend if we're only testing for a zero result.
54837 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54838 SDValue Src = Op.getOperand(0);
54839 EVT SrcVT = Src.getValueType();
54840 if (SrcVT.getScalarSizeInBits() >= 8 &&
54841 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
54842 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
54843 DAG.getConstant(0, dl, SrcVT));
54844 }
54845
54846 // Look for a truncate.
54847 if (Op.getOpcode() != ISD::TRUNCATE)
54848 return SDValue();
54849
54850 SDValue Trunc = Op;
54851 Op = Op.getOperand(0);
54852
54853 // See if we can compare with zero against the truncation source,
54854 // which should help using the Z flag from many ops. Only do this for
54855 // i32 truncated op to prevent partial-reg compares of promoted ops.
54856 EVT OpVT = Op.getValueType();
54857 APInt UpperBits =
54858 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
54859 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
54860 onlyZeroFlagUsed(SDValue(N, 0))) {
54861 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54862 DAG.getConstant(0, dl, OpVT));
54863 }
54864
54865 // After this the truncate and arithmetic op must have a single use.
54866 if (!Trunc.hasOneUse() || !Op.hasOneUse())
54867 return SDValue();
54868
54869 unsigned NewOpc;
54870 switch (Op.getOpcode()) {
54871 default: return SDValue();
54872 case ISD::AND:
54873 // Skip AND with a constant. We have special handling for AND with an
54874 // immediate during isel to generate TEST instructions.
54875 if (isa<ConstantSDNode>(Op.getOperand(1)))
54876 return SDValue();
54877 NewOpc = X86ISD::AND;
54878 break;
54879 case ISD::OR: NewOpc = X86ISD::OR; break;
54880 case ISD::XOR: NewOpc = X86ISD::XOR; break;
54881 case ISD::ADD:
54882 // If the carry or overflow flag is used, we can't truncate.
54883 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54884 return SDValue();
54885 NewOpc = X86ISD::ADD;
54886 break;
54887 case ISD::SUB:
54888 // If the carry or overflow flag is used, we can't truncate.
54889 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54890 return SDValue();
54891 NewOpc = X86ISD::SUB;
54892 break;
54893 }
54894
54895 // We found an op we can narrow. Truncate its inputs.
54896 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
54897 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
54898
54899 // Use a X86 specific opcode to avoid DAG combine messing with it.
54900 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54901 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
54902
54903 // For AND, keep a CMP so that we can match the test pattern.
54904 if (NewOpc == X86ISD::AND)
54905 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54906 DAG.getConstant(0, dl, VT));
54907
54908 // Return the flags.
54909 return Op.getValue(1);
54910}
54911
54912static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
54913 TargetLowering::DAGCombinerInfo &DCI) {
54914 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
54915        "Expected X86ISD::ADD or X86ISD::SUB");
54916
54917 SDLoc DL(N);
54918 SDValue LHS = N->getOperand(0);
54919 SDValue RHS = N->getOperand(1);
54920 MVT VT = LHS.getSimpleValueType();
54921 bool IsSub = X86ISD::SUB == N->getOpcode();
54922 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
54923
54924 // If we don't use the flag result, simplify back to a generic ADD/SUB.
54925 if (!N->hasAnyUseOfValue(1)) {
54926 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
54927 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
54928 }
54929
54930 // Fold any similar generic ADD/SUB opcodes to reuse this node.
54931 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
54932 SDValue Ops[] = {N0, N1};
54933 SDVTList VTs = DAG.getVTList(N->getValueType(0));
54934 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
54935 SDValue Op(N, 0);
54936 if (Negate)
54937 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
54938 DCI.CombineTo(GenericAddSub, Op);
54939 }
54940 };
54941 MatchGeneric(LHS, RHS, false);
54942 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
54943
54944 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
54945 // EFLAGS result doesn't change.
54946 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
54947 /*ZeroSecondOpOnly*/ true);
54948}
54949
54950static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
54951 SDValue LHS = N->getOperand(0);
54952 SDValue RHS = N->getOperand(1);
54953 SDValue BorrowIn = N->getOperand(2);
54954
54955 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
54956 MVT VT = N->getSimpleValueType(0);
54957 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54958 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
54959 }
54960
54961 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
54962 // iff the flag result is dead.
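      // ((X - Y) - 0 - Carry == X - Y - Carry, so the value is preserved; the
      // flag outputs of the two forms can differ, hence the dead-flag check.)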
54963 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
54964 !N->hasAnyUseOfValue(1))
54965 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54966 LHS.getOperand(1), BorrowIn);
54967
54968 return SDValue();
54969}
54970
54971// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
54972static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
54973 TargetLowering::DAGCombinerInfo &DCI) {
54974 SDValue LHS = N->getOperand(0);
54975 SDValue RHS = N->getOperand(1);
54976 SDValue CarryIn = N->getOperand(2);
54977 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
54978 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
54979
54980 // Canonicalize constant to RHS.
54981 if (LHSC && !RHSC)
54982 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
54983 CarryIn);
54984
54985 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
54986 // the result is either zero or one (depending on the input carry bit).
54987 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
54988 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
54989 // We don't have a good way to replace an EFLAGS use, so only do this when
54990 // dead right now.
54991 SDValue(N, 1).use_empty()) {
54992 SDLoc DL(N);
54993 EVT VT = N->getValueType(0);
54994 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
54995 SDValue Res1 = DAG.getNode(
54996 ISD::AND, DL, VT,
54997 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54998 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
54999 DAG.getConstant(1, DL, VT));
55000 return DCI.CombineTo(N, Res1, CarryOut);
55001 }
55002
55003 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55004 // iff the flag result is dead.
55005 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
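      // e.g. ADC(5, 7, Carry) -> ADC(0, 12, Carry): the value 5 + 7 + Carry is
      // unchanged, but EFLAGS may differ between the two forms, hence the
      // dead-flag requirement.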
55006 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55007 SDLoc DL(N);
55008 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55009 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55010 DAG.getConstant(0, DL, LHS.getValueType()),
55011 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55012 }
55013
55014 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55015 MVT VT = N->getSimpleValueType(0);
55016 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55017 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55018 }
55019
55020 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55021 // iff the flag result is dead.
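      // ((X + Y) + 0 + Carry == X + Y + Carry, so the value is preserved; only
      // the flag outputs could differ, hence the dead-flag restriction.)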
55022 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55023 !N->hasAnyUseOfValue(1))
55024 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55025 LHS.getOperand(1), CarryIn);
55026
55027 return SDValue();
55028}
55029
55030static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55031 const SDLoc &DL, EVT VT,
55032 const X86Subtarget &Subtarget) {
55033 // Example of pattern we try to detect:
55034 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55035 //(add (build_vector (extract_elt t, 0),
55036 // (extract_elt t, 2),
55037 // (extract_elt t, 4),
55038 // (extract_elt t, 6)),
55039 // (build_vector (extract_elt t, 1),
55040 // (extract_elt t, 3),
55041 // (extract_elt t, 5),
55042 // (extract_elt t, 7)))
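      // i.e. each i32 lane of the result is x0[2i]*x1[2i] + x0[2i+1]*x1[2i+1],
      // which is exactly what PMADDWD computes from the narrowed i16 inputs.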
55043
55044 if (!Subtarget.hasSSE2())
55045 return SDValue();
55046
55047 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55048 Op1.getOpcode() != ISD::BUILD_VECTOR)
55049 return SDValue();
55050
55051 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55052 VT.getVectorNumElements() < 4 ||
55053 !isPowerOf2_32(VT.getVectorNumElements()))
55054 return SDValue();
55055
55056 // Check if one of Op0,Op1 is of the form:
55057 // (build_vector (extract_elt Mul, 0),
55058 // (extract_elt Mul, 2),
55059 // (extract_elt Mul, 4),
55060 // ...
55061 // the other is of the form:
55062 // (build_vector (extract_elt Mul, 1),
55063 // (extract_elt Mul, 3),
55064 // (extract_elt Mul, 5),
55065 // ...
55066 // and identify Mul.
55067 SDValue Mul;
55068 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55069 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55070 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55071 // TODO: Be more tolerant to undefs.
55072 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55073 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55074 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55075 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55076 return SDValue();
55077 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55078 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55079 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55080 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55081 if (!Const0L || !Const1L || !Const0H || !Const1H)
55082 return SDValue();
55083 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55084 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55085 // Commutativity of mul allows factors of a product to reorder.
55086 if (Idx0L > Idx1L)
55087 std::swap(Idx0L, Idx1L);
55088 if (Idx0H > Idx1H)
55089 std::swap(Idx0H, Idx1H);
55090 // Commutativity of add allows pairs of factors to reorder.
55091 if (Idx0L > Idx0H) {
55092 std::swap(Idx0L, Idx0H);
55093 std::swap(Idx1L, Idx1H);
55094 }
55095 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55096 Idx1H != 2 * i + 3)
55097 return SDValue();
55098 if (!Mul) {
55099 // First time an extract_elt's source vector is visited. Must be a MUL
55100 // with 2X the number of vector elements of the BUILD_VECTOR.
55101 // Both extracts must be from the same MUL.
55102 Mul = Op0L->getOperand(0);
55103 if (Mul->getOpcode() != ISD::MUL ||
55104 Mul.getValueType().getVectorNumElements() != 2 * e)
55105 return SDValue();
55106 }
55107 // Check that the extract is from the same MUL previously seen.
55108 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55109 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55110 return SDValue();
55111 }
55112
55113 // Check if the Mul source can be safely shrunk.
55114 ShrinkMode Mode;
55115 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55116 Mode == ShrinkMode::MULU16)
55117 return SDValue();
55118
55119 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55120 VT.getVectorNumElements() * 2);
55121 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55122 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55123
55124 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55125 ArrayRef<SDValue> Ops) {
55126 EVT InVT = Ops[0].getValueType();
55127 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55128 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55129 InVT.getVectorNumElements() / 2);
55130 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55131 };
55132 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55133}
55134
55135// Attempt to turn this pattern into PMADDWD.
55136// (add (mul (sext (build_vector)), (sext (build_vector))),
55137// (mul (sext (build_vector)), (sext (build_vector)))
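      // This is the fully scalarized form of the same dot-product step: each
      // i32 result element ends up as a[2i]*b[2i] + a[2i+1]*b[2i+1], built from
      // individually extracted i16 values.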
55138static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55139 const SDLoc &DL, EVT VT,
55140 const X86Subtarget &Subtarget) {
55141 if (!Subtarget.hasSSE2())
55142 return SDValue();
55143
55144 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55145 return SDValue();
55146
55147 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55148 VT.getVectorNumElements() < 4 ||
55149 !isPowerOf2_32(VT.getVectorNumElements()))
55150 return SDValue();
55151
55152 SDValue N00 = N0.getOperand(0);
55153 SDValue N01 = N0.getOperand(1);
55154 SDValue N10 = N1.getOperand(0);
55155 SDValue N11 = N1.getOperand(1);
55156
55157 // All inputs need to be sign extends.
55158 // TODO: Support ZERO_EXTEND from known positive?
55159 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55160 N01.getOpcode() != ISD::SIGN_EXTEND ||
55161 N10.getOpcode() != ISD::SIGN_EXTEND ||
55162 N11.getOpcode() != ISD::SIGN_EXTEND)
55163 return SDValue();
55164
55165 // Peek through the extends.
55166 N00 = N00.getOperand(0);
55167 N01 = N01.getOperand(0);
55168 N10 = N10.getOperand(0);
55169 N11 = N11.getOperand(0);
55170
55171 // Must be extending from vXi16.
55172 EVT InVT = N00.getValueType();
55173 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55174 N10.getValueType() != InVT || N11.getValueType() != InVT)
55175 return SDValue();
55176
55177 // All inputs should be build_vectors.
55178 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55179 N01.getOpcode() != ISD::BUILD_VECTOR ||
55180 N10.getOpcode() != ISD::BUILD_VECTOR ||
55181 N11.getOpcode() != ISD::BUILD_VECTOR)
55182 return SDValue();
55183
55184 // For each result element, the even element of one vector must be multiplied
55185 // by the even element of the other vector, and the odd element of the first
55186 // vector must be multiplied by the odd element of the other vector, with the
55187 // two products added together. That is, for each element i this operation
55188 // must be performed:
55189 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55190 SDValue In0, In1;
55191 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55192 SDValue N00Elt = N00.getOperand(i);
55193 SDValue N01Elt = N01.getOperand(i);
55194 SDValue N10Elt = N10.getOperand(i);
55195 SDValue N11Elt = N11.getOperand(i);
55196 // TODO: Be more tolerant to undefs.
55197 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55198 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55199 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55200 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55201 return SDValue();
55202 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55203 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55204 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55205 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55206 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55207 return SDValue();
55208 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55209 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55210 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55211 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55212 // Add is commutative so indices can be reordered.
55213 if (IdxN00 > IdxN10) {
55214 std::swap(IdxN00, IdxN10);
55215 std::swap(IdxN01, IdxN11);
55216 }
55217 // N0 indices must be the even element. N1 indices must be the next odd element.
55218 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55219 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55220 return SDValue();
55221 SDValue N00In = N00Elt.getOperand(0);
55222 SDValue N01In = N01Elt.getOperand(0);
55223 SDValue N10In = N10Elt.getOperand(0);
55224 SDValue N11In = N11Elt.getOperand(0);
55225
55226 // First time we find an input, capture it.
55227 if (!In0) {
55228 In0 = N00In;
55229 In1 = N01In;
55230
55231 // The input vectors must be at least as wide as the output.
55232 // If they are larger than the output, we extract subvector below.
55233 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55234 In1.getValueSizeInBits() < VT.getSizeInBits())
55235 return SDValue();
55236 }
55237 // Mul is commutative so the input vectors can be in any order.
55238 // Canonicalize to make the compares easier.
55239 if (In0 != N00In)
55240 std::swap(N00In, N01In);
55241 if (In0 != N10In)
55242 std::swap(N10In, N11In);
55243 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55244 return SDValue();
55245 }
55246
55247 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55248 ArrayRef<SDValue> Ops) {
55249 EVT OpVT = Ops[0].getValueType();
55250 assert(OpVT.getScalarType() == MVT::i16 &&
55251        "Unexpected scalar element type");
55252 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55253 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55254 OpVT.getVectorNumElements() / 2);
55255 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55256 };
55257
55258 // If the output is narrower than an input, extract the low part of the input
55259 // vector.
55260 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55261 VT.getVectorNumElements() * 2);
55262 if (OutVT16.bitsLT(In0.getValueType())) {
55263 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55264 DAG.getIntPtrConstant(0, DL));
55265 }
55266 if (OutVT16.bitsLT(In1.getValueType())) {
55267 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55268 DAG.getIntPtrConstant(0, DL));
55269 }
55270 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55271 PMADDBuilder);
55272}
55273
55274// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55275 // If the upper element in each pair of both VPMADDWD nodes is zero, then we
55276 // can merge the operand elements and use the implicit add of VPMADDWD.
55277// TODO: Add support for VPMADDUBSW (which isn't commutable).
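// e.g. for v4i32: when the odd i16 element of each pair is zero in (at least)
// one operand of each VPMADDWD, lane i of ADD(VPMADDWD(X,Y),VPMADDWD(Z,W))
// reduces to X[2i]*Y[2i] + Z[2i]*W[2i], and a single VPMADDWD of the
// interleaved even elements computes the same sum.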
55278static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
55279 const SDLoc &DL, EVT VT) {
55280 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55281 return SDValue();
55282
55283 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55284 if (VT.getSizeInBits() > 128)
55285 return SDValue();
55286
55287 unsigned NumElts = VT.getVectorNumElements();
55288 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55289 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
55290 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55291
55292 bool Op0HiZero =
55293 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55294 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55295 bool Op1HiZero =
55296 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55297 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55298
55299 // TODO: Check for zero lower elements once we have actual codegen that
55300 // creates them.
55301 if (!Op0HiZero || !Op1HiZero)
55302 return SDValue();
55303
55304 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55305 SmallVector<int> Mask;
55306 for (int i = 0; i != (int)NumElts; ++i) {
55307 Mask.push_back(2 * i);
55308 Mask.push_back(2 * (i + NumElts));
55309 }
55310
55311 SDValue LHS =
55312 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55313 SDValue RHS =
55314 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55315 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55316}
55317
55318/// CMOV of constants requires materializing constant operands in registers.
55319/// Try to fold those constants into an 'add' instruction to reduce instruction
55320/// count. We do this with CMOV rather than the generic 'select' because there are
55321/// earlier folds that may be used to turn select-of-constants into logic hacks.
55322static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
55323 const X86Subtarget &Subtarget) {
55324 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55325 // better because we eliminate 1-2 instructions. This transform is still
55326 // an improvement without zero operands because we trade 2 move constants and
55327 // 1 add for 2 adds (LEA) as long as the constants can be represented as
55328 // immediate asm operands (fit in 32-bits).
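      // e.g. add (cmov 0, 42), x --> cmov x, (add x, 42): the zero arm needs no
      // constant materialization, and the remaining add of an immediate can be a
      // single LEA/ADD.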
55329 auto isSuitableCmov = [](SDValue V) {
55330 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55331 return false;
55332 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55333 !isa<ConstantSDNode>(V.getOperand(1)))
55334 return false;
55335 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55336 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55337 V.getConstantOperandAPInt(1).isSignedIntN(32));
55338 };
55339
55340 // Match an appropriate CMOV as the first operand of the add.
55341 SDValue Cmov = N->getOperand(0);
55342 SDValue OtherOp = N->getOperand(1);
55343 if (!isSuitableCmov(Cmov))
55344 std::swap(Cmov, OtherOp);
55345 if (!isSuitableCmov(Cmov))
55346 return SDValue();
55347
55348 // Don't remove a load folding opportunity for the add. That would neutralize
55349 // any improvements from removing constant materializations.
55350 if (X86::mayFoldLoad(OtherOp, Subtarget))
55351 return SDValue();
55352
55353 EVT VT = N->getValueType(0);
55354 SDLoc DL(N);
55355 SDValue FalseOp = Cmov.getOperand(0);
55356 SDValue TrueOp = Cmov.getOperand(1);
55357
55358 // We will push the add through the select, but we can potentially do better
55359 // if we know there is another add in the sequence and this is pointer math.
55360 // In that case, we can absorb an add into the trailing memory op and avoid
55361 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55362 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55363 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55364 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55365 all_of(N->uses(), [&](SDNode *Use) {
55366 auto *MemNode = dyn_cast<MemSDNode>(Use);
55367 return MemNode && MemNode->getBasePtr().getNode() == N;
55368 })) {
55369 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55370 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55371 // it is possible that choosing op1 might be better.
55372 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55373 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55374 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55375 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55376 Cmov.getOperand(2), Cmov.getOperand(3));
55377 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55378 }
55379
55380 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55381 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55382 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55383 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55384 Cmov.getOperand(3));
55385}
55386
55387static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55388 TargetLowering::DAGCombinerInfo &DCI,
55389 const X86Subtarget &Subtarget) {
55390 EVT VT = N->getValueType(0);
55391 SDValue Op0 = N->getOperand(0);
55392 SDValue Op1 = N->getOperand(1);
55393 SDLoc DL(N);
55394
55395 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55396 return Select;
55397
55398 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55399 return MAdd;
55400 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55401 return MAdd;
55402 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55403 return MAdd;
55404
55405 // Try to synthesize horizontal adds from adds of shuffles.
55406 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55407 return V;
55408
55409 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55410 // (sub Y, (sext (vXi1 X))).
55411 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55412 // generic DAG combine without a legal type check, but adding this there
55413 // caused regressions.
55414 if (VT.isVector()) {
55415 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55416 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55417 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55418 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55419 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55420 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55421 }
55422
55423 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55424 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55425 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55426 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55427 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55428 }
55429 }
55430
55431 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
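      // ((Y + 0 + W) + X == X + Y + W; the ADC's flag result is already known
      //  dead, as asserted below.)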
55432 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55433 X86::isZeroNode(Op0.getOperand(1))) {
55434 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55435 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55436 Op0.getOperand(0), Op0.getOperand(2));
55437 }
55438
55439 return combineAddOrSubToADCOrSBB(N, DAG);
55440}
55441
55442// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55443// condition comes from the subtract node that produced -X. This matches the
55444// cmov expansion for absolute value. By swapping the operands we convert abs
55445// to nabs.
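      // i.e. Y - abs(X) becomes Y + (-abs(X)) == Y + nabs(X); negating a cmov of
      // {X, -X} just swaps which operand it selects.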
55446static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55447 SDValue N0 = N->getOperand(0);
55448 SDValue N1 = N->getOperand(1);
55449
55450 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55451 return SDValue();
55452
55453 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55454 if (CC != X86::COND_S && CC != X86::COND_NS)
55455 return SDValue();
55456
55457 // Condition should come from a negate operation.
55458 SDValue Cond = N1.getOperand(3);
55459 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55460 return SDValue();
55461 assert(Cond.getResNo() == 1 && "Unexpected result number");
55462
55463 // Get the X and -X from the negate.
55464 SDValue NegX = Cond.getValue(0);
55465 SDValue X = Cond.getOperand(1);
55466
55467 SDValue FalseOp = N1.getOperand(0);
55468 SDValue TrueOp = N1.getOperand(1);
55469
55470 // Cmov operands should be X and NegX. Order doesn't matter.
55471 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55472 return SDValue();
55473
55474 // Build a new CMOV with the operands swapped.
55475 SDLoc DL(N);
55476 MVT VT = N->getSimpleValueType(0);
55477 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55478 N1.getOperand(2), Cond);
55479 // Convert sub to add.
55480 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55481}
55482
55483static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55484 TargetLowering::DAGCombinerInfo &DCI,
55485 const X86Subtarget &Subtarget) {
55486 SDValue Op0 = N->getOperand(0);
55487 SDValue Op1 = N->getOperand(1);
55488
55489 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55490 auto IsNonOpaqueConstant = [&](SDValue Op) {
55491 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55492 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55493 return !Cst->isOpaque();
55494 return true;
55495 }
55496 return false;
55497 };
55498
55499 // X86 can't encode an immediate LHS of a sub. See if we can push the
55500 // negation into a preceding instruction. If the RHS of the sub is an XOR with
55501 // one use and a constant, invert the immediate, saving one register.
55502 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
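      // This holds because xor(X, ~C2) == ~xor(X, C2) == -xor(X, C2) - 1, so
      // add(xor(X, ~C2), C1 + 1) == C1 - xor(X, C2).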
55503 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55504 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
55505 SDLoc DL(N);
55506 EVT VT = Op0.getValueType();
55507 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55508 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55509 SDValue NewAdd =
55510 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55511 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55512 }
55513
55514 if (SDValue V = combineSubABS(N, DAG))
55515 return V;
55516
55517 // Try to synthesize horizontal subs from subs of shuffles.
55518 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55519 return V;
55520
55521 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
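      // (X - (Y + 0 + W) == X - Y - W, i.e. a subtract-with-borrow of Y from X.)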
55522 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55523 X86::isZeroNode(Op1.getOperand(1))) {
55524 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55525 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55526 Op1.getOperand(0), Op1.getOperand(2));
55527 }
55528
55529 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55530 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
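      // (X - (Y - Z - W) == (X + Z + W) - Y, so the borrow is re-absorbed as a
      //  carry on the LHS before the plain subtract of Y.)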
55531 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55532 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55533 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55534 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55535 Op1.getOperand(1), Op1.getOperand(2));
55536 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55537 Op1.getOperand(0));
55538 }
55539
55540 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
55541 return V;
55542
55543 return combineAddOrSubToADCOrSBB(N, DAG);
55544}
55545
55546static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55547 const X86Subtarget &Subtarget) {
55548 MVT VT = N->getSimpleValueType(0);
55549 SDLoc DL(N);
55550
55551 if (N->getOperand(0) == N->getOperand(1)) {
55552 if (N->getOpcode() == X86ISD::PCMPEQ)
55553 return DAG.getConstant(-1, DL, VT);
55554 if (N->getOpcode() == X86ISD::PCMPGT)
55555 return DAG.getConstant(0, DL, VT);
55556 }
55557
55558 return SDValue();
55559}
55560
55561/// Helper that combines an array of subvector ops as if they were the operands
55562/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55563/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
55564static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55565 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55566 TargetLowering::DAGCombinerInfo &DCI,
55567 const X86Subtarget &Subtarget) {
55568 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55569 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55570
55571 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55572 return DAG.getUNDEF(VT);
55573
55574 if (llvm::all_of(Ops, [](SDValue Op) {
55575 return ISD::isBuildVectorAllZeros(Op.getNode());
55576 }))
55577 return getZeroVector(VT, Subtarget, DAG, DL);
55578
55579 SDValue Op0 = Ops[0];
55580 bool IsSplat = llvm::all_equal(Ops);
55581
55582 // Repeated subvectors.
55583 if (IsSplat &&
55584 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55585 // If this broadcast is inserted into both halves, use a larger broadcast.
55586 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55587 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55588
55589 // If this simple subvector load or scalar/subvector broadcast_load is inserted
55590 // into both halves, use a larger broadcast_load. Update other uses to use
55591 // an extracted subvector.
55592 if (ISD::isNormalLoad(Op0.getNode()) ||
55593 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55594 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
55595 auto *Mem = cast<MemSDNode>(Op0);
55596 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
55597 ? X86ISD::VBROADCAST_LOAD
55598 : X86ISD::SUBV_BROADCAST_LOAD;
55599 if (SDValue BcastLd =
55600 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
55601 SDValue BcastSrc =
55602 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
55603 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
55604 return BcastLd;
55605 }
55606 }
55607
55608 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55609 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55610 (Subtarget.hasAVX2() ||
55611 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55612 VT.getScalarType(), Subtarget)))
55613 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55614 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55615 Op0.getOperand(0),
55616 DAG.getIntPtrConstant(0, DL)));
55617
55618 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55619 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55620 (Subtarget.hasAVX2() ||
55621 (EltSizeInBits >= 32 &&
55622 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55623 Op0.getOperand(0).getValueType() == VT.getScalarType())
55624 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55625
55626 // concat_vectors(extract_subvector(broadcast(x)),
55627 // extract_subvector(broadcast(x))) -> broadcast(x)
55628 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55629 Op0.getOperand(0).getValueType() == VT) {
55630 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
55631 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
55632 return Op0.getOperand(0);
55633 }
55634 }
55635
55636 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55637 // Only handle concats of subvector high halves, which vperm2x128 is best at.
55638 // TODO: This should go in combineX86ShufflesRecursively eventually.
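      // (The 0x31 immediate used below selects the upper 128-bit half of each
      //  source for VPERM2X128, matching the high-half extracts being concatenated.)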
55639 if (VT.is256BitVector() && Ops.size() == 2) {
55640 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55641 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55642 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55643 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55644 EVT SrcVT0 = Src0.getOperand(0).getValueType();
55645 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55646 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55647 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55648 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55649 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55650 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
55651 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55652 DAG.getBitcast(VT, Src0.getOperand(0)),
55653 DAG.getBitcast(VT, Src1.getOperand(0)),
55654 DAG.getTargetConstant(0x31, DL, MVT::i8));
55655 }
55656 }
55657 }
55658
55659 // Repeated opcode.
55660 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55661 // but it currently struggles with different vector widths.
55662 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55663 return Op.getOpcode() == Op0.getOpcode();
55664 })) {
55665 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55666 SmallVector<SDValue> Subs;
55667 for (SDValue SubOp : SubOps)
55668 Subs.push_back(SubOp.getOperand(I));
55669 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55670 };
55671 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
55672 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55673 SDValue Sub = SubOps[I].getOperand(Op);
55674 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55675 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
55676 Sub.getOperand(0).getValueType() != VT ||
55677 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
55678 return false;
55679 }
55680 return true;
55681 };
55682
55683 unsigned NumOps = Ops.size();
55684 switch (Op0.getOpcode()) {
55685 case X86ISD::VBROADCAST: {
55686 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55687 return Op.getOperand(0).getValueType().is128BitVector();
55688 })) {
55689 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55690 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55691 ConcatSubOperand(VT, Ops, 0),
55692 ConcatSubOperand(VT, Ops, 0));
55693 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55694 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55695 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55696 : X86ISD::PSHUFD,
55697 DL, VT, ConcatSubOperand(VT, Ops, 0),
55698 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55699 }
55700 break;
55701 }
55702 case X86ISD::MOVDDUP:
55703 case X86ISD::MOVSHDUP:
55704 case X86ISD::MOVSLDUP: {
55705 if (!IsSplat)
55706 return DAG.getNode(Op0.getOpcode(), DL, VT,
55707 ConcatSubOperand(VT, Ops, 0));
55708 break;
55709 }
55710 case X86ISD::SHUFP: {
55711 // Add SHUFPD support if/when necessary.
55712 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
55713 llvm::all_of(Ops, [Op0](SDValue Op) {
55714 return Op.getOperand(2) == Op0.getOperand(2);
55715 })) {
55716 return DAG.getNode(Op0.getOpcode(), DL, VT,
55717 ConcatSubOperand(VT, Ops, 0),
55718 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55719 }
55720 break;
55721 }
55722 case X86ISD::PSHUFHW:
55723 case X86ISD::PSHUFLW:
55724 case X86ISD::PSHUFD:
55725 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
55726 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
55727 return DAG.getNode(Op0.getOpcode(), DL, VT,
55728 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55729 }
55730 [[fallthrough]];
55731 case X86ISD::VPERMILPI:
55732 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
55733 (VT.is256BitVector() ||
55734 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55735 all_of(Ops, [&Op0](SDValue Op) {
55736 return Op0.getOperand(1) == Op.getOperand(1);
55737 })) {
55738 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
55739 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
55740 Res =
55741 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
55742 return DAG.getBitcast(VT, Res);
55743 }
55744 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
55745 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
55746 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
55747 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
55748 return DAG.getNode(Op0.getOpcode(), DL, VT,
55749 ConcatSubOperand(VT, Ops, 0),
55750 DAG.getTargetConstant(Idx, DL, MVT::i8));
55751 }
55752 break;
55753 case X86ISD::PSHUFB:
55754 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55755 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55756 return DAG.getNode(Op0.getOpcode(), DL, VT,
55757 ConcatSubOperand(VT, Ops, 0),
55758 ConcatSubOperand(VT, Ops, 1));
55759 }
55760 break;
55761 case X86ISD::VPERMV:
55762 if (!IsSplat && NumOps == 2 &&
55763 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
55764 MVT OpVT = Op0.getSimpleValueType();
55765 int NumSrcElts = OpVT.getVectorNumElements();
55766 SmallVector<int, 64> ConcatMask;
55767 for (unsigned i = 0; i != NumOps; ++i) {
55768 SmallVector<int, 64> SubMask;
55769 SmallVector<SDValue, 2> SubOps;
55770 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
55771 SubMask))
55772 break;
55773 for (int M : SubMask) {
55774 if (0 <= M)
55775 M += i * NumSrcElts;
55776 ConcatMask.push_back(M);
55777 }
55778 }
55779 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55780 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
55781 Ops[1].getOperand(1), DAG, DL);
55782 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55783 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55784 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55785 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
55786 }
55787 }
55788 break;
55789 case X86ISD::VPERMV3:
55790 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55791 MVT OpVT = Op0.getSimpleValueType();
55792 int NumSrcElts = OpVT.getVectorNumElements();
55793 SmallVector<int, 64> ConcatMask;
55794 for (unsigned i = 0; i != NumOps; ++i) {
55795 SmallVector<int, 64> SubMask;
55796 SmallVector<SDValue, 2> SubOps;
55797 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
55798 SubMask))
55799 break;
55800 for (int M : SubMask) {
55801 if (0 <= M) {
55802 M += M < NumSrcElts ? 0 : NumSrcElts;
55803 M += i * NumSrcElts;
55804 }
55805 ConcatMask.push_back(M);
55806 }
55807 }
55808 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55809 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
55810 Ops[1].getOperand(0), DAG, DL);
55811 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
55812 Ops[1].getOperand(2), DAG, DL);
55813 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55814 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55815 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55816 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
55817 }
55818 }
55819 break;
55820 case X86ISD::VSHLI:
55821 case X86ISD::VSRLI:
55822 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
55823 // TODO: Move this to LowerShiftByScalarImmediate?
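      // e.g. shifting a 64-bit lane left by 32 moves its low 32-bit half into the
      // high half and zeroes the low half, which the i32 shuffle with a zero
      // vector below reproduces (mirrored for the right shift).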
55824 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
55825 llvm::all_of(Ops, [](SDValue Op) {
55826 return Op.getConstantOperandAPInt(1) == 32;
55827 })) {
55828 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
55829 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
55830 if (Op0.getOpcode() == X86ISD::VSHLI) {
55831 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55832 {8, 0, 8, 2, 8, 4, 8, 6});
55833 } else {
55834 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55835 {1, 8, 3, 8, 5, 8, 7, 8});
55836 }
55837 return DAG.getBitcast(VT, Res);
55838 }
55839 [[fallthrough]];
55840 case X86ISD::VSRAI:
55841 case X86ISD::VSHL:
55842 case X86ISD::VSRL:
55843 case X86ISD::VSRA:
55844 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
55845 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55846 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
55847 llvm::all_of(Ops, [Op0](SDValue Op) {
55848 return Op0.getOperand(1) == Op.getOperand(1);
55849 })) {
55850 return DAG.getNode(Op0.getOpcode(), DL, VT,
55851 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55852 }
55853 break;
55854 case X86ISD::VPERMI:
55855 case X86ISD::VROTLI:
55856 case X86ISD::VROTRI:
55857 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55858 llvm::all_of(Ops, [Op0](SDValue Op) {
55859 return Op0.getOperand(1) == Op.getOperand(1);
55860 })) {
55861 return DAG.getNode(Op0.getOpcode(), DL, VT,
55862 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55863 }
55864 break;
55865 case ISD::AND:
55866 case ISD::OR:
55867 case ISD::XOR:
55868 case X86ISD::ANDNP:
55869 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55870 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55871 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55872 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55873 NumOps * SrcVT.getVectorNumElements());
55874 return DAG.getNode(Op0.getOpcode(), DL, VT,
55875 ConcatSubOperand(SrcVT, Ops, 0),
55876 ConcatSubOperand(SrcVT, Ops, 1));
55877 }
55878 break;
55879 case X86ISD::GF2P8AFFINEQB:
55880 if (!IsSplat &&
55881 (VT.is256BitVector() ||
55882 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55883 llvm::all_of(Ops, [Op0](SDValue Op) {
55884 return Op0.getOperand(2) == Op.getOperand(2);
55885 })) {
55886 return DAG.getNode(Op0.getOpcode(), DL, VT,
55887 ConcatSubOperand(VT, Ops, 0),
55888 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55889 }
55890 break;
55891 case ISD::FADD:
55892 case ISD::FSUB:
55893 case ISD::FMUL:
55894 case ISD::FDIV:
55895 if (!IsSplat && (VT.is256BitVector() ||
55896 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55897 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55898 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55899 NumOps * SrcVT.getVectorNumElements());
55900 return DAG.getNode(Op0.getOpcode(), DL, VT,
55901 ConcatSubOperand(SrcVT, Ops, 0),
55902 ConcatSubOperand(SrcVT, Ops, 1));
55903 }
55904 break;
55905 case X86ISD::HADD:
55906 case X86ISD::HSUB:
55907 case X86ISD::FHADD:
55908 case X86ISD::FHSUB:
55909 case X86ISD::PACKSS:
55910 case X86ISD::PACKUS:
55911 if (!IsSplat && VT.is256BitVector() &&
55912 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
55913 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55914 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55915 NumOps * SrcVT.getVectorNumElements());
55916 return DAG.getNode(Op0.getOpcode(), DL, VT,
55917 ConcatSubOperand(SrcVT, Ops, 0),
55918 ConcatSubOperand(SrcVT, Ops, 1));
55919 }
55920 break;
55921 case X86ISD::PALIGNR:
55922 if (!IsSplat &&
55923 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55924 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
55925 llvm::all_of(Ops, [Op0](SDValue Op) {
55926 return Op0.getOperand(2) == Op.getOperand(2);
55927 })) {
55928 return DAG.getNode(Op0.getOpcode(), DL, VT,
55929 ConcatSubOperand(VT, Ops, 0),
55930 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55931 }
55932 break;
55933 case ISD::VSELECT:
55934 if (!IsSplat && Subtarget.hasAVX512() &&
55935 (VT.is256BitVector() ||
55936 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55937 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
55938 EVT SelVT = Ops[0].getOperand(0).getValueType();
55939 if (SelVT.getVectorElementType() == MVT::i1) {
55940 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
55941 Ops.size() * SelVT.getVectorNumElements());
55942 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
55943 return DAG.getNode(Op0.getOpcode(), DL, VT,
55944 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55945 ConcatSubOperand(VT, Ops, 1),
55946 ConcatSubOperand(VT, Ops, 2));
55947 }
55948 }
55949 [[fallthrough]];
55950 case X86ISD::BLENDV:
55951 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
55952 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
55953 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
55954 EVT SelVT = Ops[0].getOperand(0).getValueType();
55955 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
55956 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
55957 return DAG.getNode(Op0.getOpcode(), DL, VT,
55958 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55959 ConcatSubOperand(VT, Ops, 1),
55960 ConcatSubOperand(VT, Ops, 2));
55961 }
55962 break;
55963 }
55964 }
55965
55966 // Fold subvector loads into one.
55967 // If needed, look through bitcasts to get to the load.
55968 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
55969 unsigned Fast;
55970 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
55971 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
55972 *FirstLd->getMemOperand(), &Fast) &&
55973 Fast) {
55974 if (SDValue Ld =
55975 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
55976 return Ld;
55977 }
55978 }
55979
55980 // Attempt to fold target constant loads.
55981 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
55982 SmallVector<APInt> EltBits;
55983 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
55984 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
55985 APInt OpUndefElts;
55986 SmallVector<APInt> OpEltBits;
55987 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
55988 OpEltBits, true, false))
55989 break;
55990 EltBits.append(OpEltBits);
55991 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
55992 }
55993 if (EltBits.size() == VT.getVectorNumElements())
55994 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
55995 }
55996
55997 return SDValue();
55998}
55999
56000static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56001 TargetLowering::DAGCombinerInfo &DCI,
56002 const X86Subtarget &Subtarget) {
56003 EVT VT = N->getValueType(0);
56004 EVT SrcVT = N->getOperand(0).getValueType();
56005 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56006 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56007
56008 if (VT.getVectorElementType() == MVT::i1) {
56009 // Attempt to constant fold.
56010 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56011 APInt Constant = APInt::getZero(VT.getSizeInBits());
56012 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56013 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56014 if (!C) break;
56015 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56016 if (I == (E - 1)) {
56017 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56018 if (TLI.isTypeLegal(IntVT))
56019 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56020 }
56021 }
56022
56023 // Don't do anything else for i1 vectors.
56024 return SDValue();
56025 }
56026
56027 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56028 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56029 DCI, Subtarget))
56030 return R;
56031 }
56032
56033 return SDValue();
56034}
56035
56036static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56037 TargetLowering::DAGCombinerInfo &DCI,
56038 const X86Subtarget &Subtarget) {
56039 if (DCI.isBeforeLegalizeOps())
56040 return SDValue();
56041
56042 MVT OpVT = N->getSimpleValueType(0);
56043
56044 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56045
56046 SDLoc dl(N);
56047 SDValue Vec = N->getOperand(0);
56048 SDValue SubVec = N->getOperand(1);
56049
56050 uint64_t IdxVal = N->getConstantOperandVal(2);
56051 MVT SubVecVT = SubVec.getSimpleValueType();
56052
56053 if (Vec.isUndef() && SubVec.isUndef())
56054 return DAG.getUNDEF(OpVT);
56055
56056 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56057 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56058 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56059 return getZeroVector(OpVT, Subtarget, DAG, dl);
56060
56061 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56062 // If we're inserting into a zero vector and then into a larger zero vector,
56063 // just insert into the larger zero vector directly.
56064 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56065 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56066 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56067 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56068 getZeroVector(OpVT, Subtarget, DAG, dl),
56069 SubVec.getOperand(1),
56070 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56071 }
56072
56073 // If we're inserting into a zero vector and our input was extracted from an
56074 // insert into a zero vector of the same type, and the extraction was at
56075 // least as large as the original insertion, just insert the original
56076 // subvector into a zero vector.
56077 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56078 isNullConstant(SubVec.getOperand(1)) &&
56079 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56080 SDValue Ins = SubVec.getOperand(0);
56081 if (isNullConstant(Ins.getOperand(2)) &&
56082 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56083 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56084 SubVecVT.getFixedSizeInBits())
56085 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56086 getZeroVector(OpVT, Subtarget, DAG, dl),
56087 Ins.getOperand(1), N->getOperand(2));
56088 }
56089 }
56090
56091 // Stop here if this is an i1 vector.
56092 if (IsI1Vector)
56093 return SDValue();
56094
56095 // Eliminate an intermediate vector widening:
56096 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56097 // insert_subvector X, Y, Idx
56098 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56099 // there?
56100 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56101 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56102 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56103 SubVec.getOperand(1), N->getOperand(2));
56104
56105 // If this is an insert of an extract, combine to a shuffle. Don't do this
56106 // if the insert or extract can be represented with a subregister operation.
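      // e.g. inserting the high v4i32 half of A at index 4 of v8i32 B becomes
      // shuffle(B, A, {0,1,2,3,12,13,14,15}).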
56107 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56108 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56109 (IdxVal != 0 ||
56110 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56111 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56112 if (ExtIdxVal != 0) {
56113 int VecNumElts = OpVT.getVectorNumElements();
56114 int SubVecNumElts = SubVecVT.getVectorNumElements();
56115 SmallVector<int, 64> Mask(VecNumElts);
56116 // First create an identity shuffle mask.
56117 for (int i = 0; i != VecNumElts; ++i)
56118 Mask[i] = i;
56119 // Now insert the extracted portion.
56120 for (int i = 0; i != SubVecNumElts; ++i)
56121 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56122
56123 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56124 }
56125 }
56126
56127 // Match concat_vector style patterns.
56128 SmallVector<SDValue, 2> SubVectorOps;
56129 if (collectConcatOps(N, SubVectorOps, DAG)) {
56130 if (SDValue Fold =
56131 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56132 return Fold;
56133
56134 // If we're inserting all zeros into the upper half, change this to
56135 // a concat with zero. We will match this to a move
56136 // with implicit upper bit zeroing during isel.
56137 // We do this here because we don't want combineConcatVectorOps to
56138 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56139 if (SubVectorOps.size() == 2 &&
56140 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56141 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56142 getZeroVector(OpVT, Subtarget, DAG, dl),
56143 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56144 }
56145
56146 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56147 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56148 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56149
56150 // If this is a broadcast load inserted into an upper undef, use a larger
56151 // broadcast load.
56152 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56153 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56154 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56155 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56156 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56157 SDValue BcastLd =
56158 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56159 MemIntr->getMemoryVT(),
56160 MemIntr->getMemOperand());
56161 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56162 return BcastLd;
56163 }
56164
56165 // If we're splatting the lower half subvector of a full vector load into the
56166 // upper half, attempt to create a subvector broadcast.
56167 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56168 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56169 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56170 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56171 if (VecLd && SubLd &&
56172 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56173 SubVec.getValueSizeInBits() / 8, 0))
56174 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56175 SubLd, 0, DAG);
56176 }
56177
56178 return SDValue();
56179}
56180
56181/// If we are extracting a subvector of a vector select and the select condition
56182/// is composed of concatenated vectors, try to narrow the select width. This
56183/// is a common pattern for AVX1 integer code because 256-bit selects may be
56184/// legal, but there is almost no integer math/logic available for 256-bit.
56185/// This function should only be called with legal types (otherwise, the calls
56186/// to get simple value types will assert).
56187static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56188 SDValue Sel = Ext->getOperand(0);
56189 SmallVector<SDValue, 4> CatOps;
56190 if (Sel.getOpcode() != ISD::VSELECT ||
56191 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56192 return SDValue();
56193
56194 // Note: We assume simple value types because this should only be called with
56195 // legal operations/types.
56196 // TODO: This can be extended to handle extraction to 256-bits.
56197 MVT VT = Ext->getSimpleValueType(0);
56198 if (!VT.is128BitVector())
56199 return SDValue();
56200
56201 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56202 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56203 return SDValue();
56204
56205 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56206 MVT SelVT = Sel.getSimpleValueType();
56207 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56208        "Unexpected vector type with legal operations");
56209
56210 unsigned SelElts = SelVT.getVectorNumElements();
56211 unsigned CastedElts = WideVT.getVectorNumElements();
56212 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56213 if (SelElts % CastedElts == 0) {
56214 // The select has the same or more (narrower) elements than the extract
56215 // operand. The extraction index gets scaled by that factor.
56216 ExtIdx *= (SelElts / CastedElts);
56217 } else if (CastedElts % SelElts == 0) {
56218 // The select has fewer (wider) elements than the extract operand. Make sure
56219 // that the extraction index can be divided evenly.
56220 unsigned IndexDivisor = CastedElts / SelElts;
56221 if (ExtIdx % IndexDivisor != 0)
56222 return SDValue();
56223 ExtIdx /= IndexDivisor;
56224 } else {
56225 llvm_unreachable("Element count of simple vector types are not divisible?");
56226 }
56227
56228 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56229 unsigned NarrowElts = SelElts / NarrowingFactor;
56230 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56231 SDLoc DL(Ext);
56232 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56233 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56234 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56235 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56236 return DAG.getBitcast(VT, NarrowSel);
56237}
56238
56239static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56240 TargetLowering::DAGCombinerInfo &DCI,
56241 const X86Subtarget &Subtarget) {
56242 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56243 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56244 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56245 // We let generic combining take over from there to simplify the
56246 // insert/extract and 'not'.
56247 // This pattern emerges during AVX1 legalization. We handle it before lowering
56248 // to avoid complications like splitting constant vector loads.
56249
56250 // Capture the original wide type in the likely case that we need to bitcast
56251 // back to this type.
56252 if (!N->getValueType(0).isSimple())
56253 return SDValue();
56254
56255 MVT VT = N->getSimpleValueType(0);
56256 SDValue InVec = N->getOperand(0);
56257 unsigned IdxVal = N->getConstantOperandVal(1);
56258 SDValue InVecBC = peekThroughBitcasts(InVec);
56259 EVT InVecVT = InVec.getValueType();
56260 unsigned SizeInBits = VT.getSizeInBits();
56261 unsigned InSizeInBits = InVecVT.getSizeInBits();
56262 unsigned NumSubElts = VT.getVectorNumElements();
56263 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56264
56265 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56266 TLI.isTypeLegal(InVecVT) &&
56267 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56268 auto isConcatenatedNot = [](SDValue V) {
56269 V = peekThroughBitcasts(V);
56270 if (!isBitwiseNot(V))
56271 return false;
56272 SDValue NotOp = V->getOperand(0);
56273 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56274 };
56275 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56276 isConcatenatedNot(InVecBC.getOperand(1))) {
56277 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56278 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
56279 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
56280 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56281 }
56282 }
56283
56284 if (DCI.isBeforeLegalizeOps())
56285 return SDValue();
56286
56287 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
56288 return V;
56289
56290 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56291 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56292
56293 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56294 if (VT.getScalarType() == MVT::i1)
56295 return DAG.getConstant(1, SDLoc(N), VT);
56296 return getOnesVector(VT, DAG, SDLoc(N));
56297 }
56298
56299 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56300 return DAG.getBuildVector(VT, SDLoc(N),
56301 InVec->ops().slice(IdxVal, NumSubElts));
56302
56303 // If we are extracting from an insert into a larger vector, replace with a
56304 // smaller insert, provided the extract covers at least the inserted subvector.
56305 // Don't do this for i1 vectors.
56306 // TODO: Relax the matching indices requirement?
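  // For example (an illustrative sketch):
  //   extract_subvector (insert_subvector X, Y, Idx), Idx
  //     --> insert_subvector (extract_subvector X, Idx), Y, 0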
56307 if (VT.getVectorElementType() != MVT::i1 &&
56308 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56309 IdxVal == InVec.getConstantOperandVal(2) &&
56310 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56311 SDLoc DL(N);
56312 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56313 InVec.getOperand(0), N->getOperand(1));
56314 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56315 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56316 InVec.getOperand(1),
56317 DAG.getVectorIdxConstant(NewIdxVal, DL));
56318 }
56319
56320 // If we're extracting an upper subvector from a broadcast, we should just
56321 // extract the lowest subvector instead, which should allow
56322 // SimplifyDemandedVectorElts to do more simplifications.
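  // For example (an illustrative sketch):
  //   extract_subvector (X86ISD::VBROADCAST_LOAD p), UpperIdx
  //     --> extract_subvector (X86ISD::VBROADCAST_LOAD p), 0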
56323 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56324 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56325 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56326 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56327
56328 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56329 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56330 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56331 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56332
56333 // Attempt to extract from the source of a shuffle vector.
56334 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56335 SmallVector<int, 32> ShuffleMask;
56336 SmallVector<int, 32> ScaledMask;
56337 SmallVector<SDValue, 2> ShuffleInputs;
56338 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56339 // Decode the shuffle mask and scale it so it's shuffling subvectors.
56340 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56341 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56342 unsigned SubVecIdx = IdxVal / NumSubElts;
56343 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56344 return DAG.getUNDEF(VT);
56345 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56346 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56347 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56348 if (Src.getValueSizeInBits() == InSizeInBits) {
56349 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56350 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56351 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56352 SDLoc(N), SizeInBits);
56353 }
56354 }
56355 }
56356
56357 // If we're extracting the lowest subvector and we're the only user,
56358 // we may be able to perform this with a smaller vector width.
56359 unsigned InOpcode = InVec.getOpcode();
56360 if (InVec.hasOneUse()) {
56361 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56362 // v2f64 CVTDQ2PD(v4i32).
56363 if (InOpcode == ISD::SINT_TO_FP &&
56364 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56365 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
56366 }
56367 // v2f64 CVTUDQ2PD(v4i32).
56368 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56369 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56370 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
56371 }
56372 // v2f64 CVTPS2PD(v4f32).
56373 if (InOpcode == ISD::FP_EXTEND &&
56374 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56375 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
56376 }
56377 }
56378 if (IdxVal == 0 &&
56379 (InOpcode == ISD::ANY_EXTEND ||
56380 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56381 InOpcode == ISD::ZERO_EXTEND ||
56382 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
56383 InOpcode == ISD::SIGN_EXTEND ||
56384 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56385 (SizeInBits == 128 || SizeInBits == 256) &&
56386 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56387 SDLoc DL(N);
56388 SDValue Ext = InVec.getOperand(0);
56389 if (Ext.getValueSizeInBits() > SizeInBits)
56390 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56391 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56392 return DAG.getNode(ExtOp, DL, VT, Ext);
56393 }
56394 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56395 InVec.getOperand(0).getValueType().is256BitVector() &&
56396 InVec.getOperand(1).getValueType().is256BitVector() &&
56397 InVec.getOperand(2).getValueType().is256BitVector()) {
56398 SDLoc DL(N);
56399 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56400 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56401 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56402 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56403 }
56404 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56405 (VT.is128BitVector() || VT.is256BitVector())) {
56406 SDLoc DL(N);
56407 SDValue InVecSrc = InVec.getOperand(0);
56408 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56409 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56410 return DAG.getNode(InOpcode, DL, VT, Ext);
56411 }
56412 if (InOpcode == X86ISD::MOVDDUP &&
56413 (VT.is128BitVector() || VT.is256BitVector())) {
56414 SDLoc DL(N);
56415 SDValue Ext0 =
56416 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56417 return DAG.getNode(InOpcode, DL, VT, Ext0);
56418 }
56419 }
56420
56421 // Always split vXi64 logical shifts where we're extracting the upper 32 bits
56422 // as this is very likely to fold into a shuffle/truncation.
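  // For example (an illustrative sketch):
  //   extract_subvector (X86ISD::VSRLI v4i64:X, 32), Idx
  //     --> X86ISD::VSRLI (extract_subvector X, Idx), 32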
56423 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56424 InVecVT.getScalarSizeInBits() == 64 &&
56425 InVec.getConstantOperandAPInt(1) == 32) {
56426 SDLoc DL(N);
56427 SDValue Ext =
56428 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56429 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56430 }
56431
56432 return SDValue();
56433}
56434
56435static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
56436 EVT VT = N->getValueType(0);
56437 SDValue Src = N->getOperand(0);
56438 SDLoc DL(N);
56439
56440 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56441 // This occurs frequently in our masked scalar intrinsic code and our
56442 // floating point select lowering with AVX512.
56443 // TODO: SimplifyDemandedBits instead?
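  // For example (an illustrative sketch):
  //   (v1i1 scalar_to_vector (and X, 1)) --> (v1i1 scalar_to_vector X)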
56444 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
56445 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
56446 if (C->getAPIntValue().isOne())
56447 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
56448 Src.getOperand(0));
56449
56450 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
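  // For example (an illustrative sketch):
  //   (v1i1 scalar_to_vector (extract_vector_elt vXi1:X, 0))
  //     --> (v1i1 extract_subvector X, 0)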
56451 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56452 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56453 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
56454 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
56455 if (C->isZero())
56456 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56457 Src.getOperand(1));
56458
56459 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
56460 // TODO: Move to DAGCombine/SimplifyDemandedBits?
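  // For example (an illustrative sketch of the zero-extend case):
  //   (v2i64 scalar_to_vector (i64 zero_extend i32:X))
  //     --> (v2i64 bitcast (VZEXT_MOVL (v4i32 scalar_to_vector X)))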
56461 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56462 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56463 if (Op.getValueType() != MVT::i64)
56464 return SDValue();
56465 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56466 if (Op.getOpcode() == Opc &&
56467 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56468 return Op.getOperand(0);
56469 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56470 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
56471 if (Ld->getExtensionType() == Ext &&
56472 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
56473 return Op;
56474 if (IsZeroExt) {
56475 KnownBits Known = DAG.computeKnownBits(Op);
56476 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
56477 return Op;
56478 }
56479 return SDValue();
56480 };
56481
56482 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
56483 return DAG.getBitcast(
56484 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56485 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
56486
56487 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
56488 return DAG.getBitcast(
56489 VT,
56490 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
56491 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56492 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
56493 }
56494
56495 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
56496 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
56497 Src.getOperand(0).getValueType() == MVT::x86mmx)
56498 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
56499
56500 // See if we're broadcasting the scalar value, in which case just reuse that.
56501 // Ensure the broadcast uses the same SDValue (result) of the source node.
56502 if (VT.getScalarType() == Src.getValueType())
56503 for (SDNode *User : Src->uses())
56504 if (User->getOpcode() == X86ISD::VBROADCAST &&
56505 Src == User->getOperand(0)) {
56506 unsigned SizeInBits = VT.getFixedSizeInBits();
56507 unsigned BroadcastSizeInBits =
56508 User->getValueSizeInBits(0).getFixedValue();
56509 if (BroadcastSizeInBits == SizeInBits)
56510 return SDValue(User, 0);
56511 if (BroadcastSizeInBits > SizeInBits)
56512 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
56513 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
56514 // coverage.
56515 }
56516
56517 return SDValue();
56518}
56519
56520// Simplify PMULDQ and PMULUDQ operations.
56521static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
56522 TargetLowering::DAGCombinerInfo &DCI,
56523 const X86Subtarget &Subtarget) {
56524 SDValue LHS = N->getOperand(0);
56525 SDValue RHS = N->getOperand(1);
56526
56527 // Canonicalize constant to RHS.
56528 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
56529 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
56530 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
56531
56532 // Multiply by zero.
56533 // Don't return RHS as it may contain UNDEFs.
56534 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
56535 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
56536
56537 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
56538 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56539 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
56540 return SDValue(N, 0);
56541
56542 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
56543 // convert it to any_extend_invec, due to the LegalOperations check, do the
56544 // conversion directly to a vector shuffle manually. This exposes combine
56545 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
56546 // combineX86ShufflesRecursively on SSE4.1 targets.
56547 // FIXME: This is basically a hack around several other issues related to
56548 // ANY_EXTEND_VECTOR_INREG.
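  // For example (an illustrative sketch):
  //   PMULUDQ (zero_extend_vector_inreg v4i32:X), Y
  //     --> PMULUDQ (v2i64 bitcast (shuffle X, X, <0,-1,1,-1>)), Y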
56549 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
56550 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56551 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56552 LHS.getOperand(0).getValueType() == MVT::v4i32) {
56553 SDLoc dl(N);
56554 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
56555 LHS.getOperand(0), { 0, -1, 1, -1 });
56556 LHS = DAG.getBitcast(MVT::v2i64, LHS);
56557 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56558 }
56559 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
56560 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56561 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56562 RHS.getOperand(0).getValueType() == MVT::v4i32) {
56563 SDLoc dl(N);
56564 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
56565 RHS.getOperand(0), { 0, -1, 1, -1 });
56566 RHS = DAG.getBitcast(MVT::v2i64, RHS);
56567 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56568 }
56569
56570 return SDValue();
56571}
56572
56573// Simplify VPMADDUBSW/VPMADDWD operations.
56574static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
56575 TargetLowering::DAGCombinerInfo &DCI) {
56576 EVT VT = N->getValueType(0);
56577 SDValue LHS = N->getOperand(0);
56578 SDValue RHS = N->getOperand(1);
56579
56580 // Multiply by zero.
56581 // Don't return LHS/RHS as it may contain UNDEFs.
56582 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
56583 ISD::isBuildVectorAllZeros(RHS.getNode()))
56584 return DAG.getConstant(0, SDLoc(N), VT);
56585
56586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56587 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56588 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56589 return SDValue(N, 0);
56590
56591 return SDValue();
56592}
56593
56594static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
56595 TargetLowering::DAGCombinerInfo &DCI,
56596 const X86Subtarget &Subtarget) {
56597 EVT VT = N->getValueType(0);
56598 SDValue In = N->getOperand(0);
56599 unsigned Opcode = N->getOpcode();
56600 unsigned InOpcode = In.getOpcode();
56601 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56602 SDLoc DL(N);
56603
56604 // Try to merge vector loads and extend_inreg into an extload.
56605 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
56606 In.hasOneUse()) {
56607 auto *Ld = cast<LoadSDNode>(In);
56608 if (Ld->isSimple()) {
56609 MVT SVT = In.getSimpleValueType().getVectorElementType();
56610 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
56611 ? ISD::SEXTLOAD
56612 : ISD::ZEXTLOAD;
56613 EVT MemVT = VT.changeVectorElementType(SVT);
56614 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
56615 SDValue Load = DAG.getExtLoad(
56616 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
56617 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
56618 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
56619 return Load;
56620 }
56621 }
56622 }
56623
56624 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
56625 if (Opcode == InOpcode)
56626 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
56627
56628 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
56629 // -> EXTEND_VECTOR_INREG(X).
56630 // TODO: Handle non-zero subvector indices.
56631 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
56632 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
56633 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
56634 In.getValueSizeInBits())
56635 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
56636
56637 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
56638 // TODO: Move to DAGCombine?
56639 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
56640 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
56641 In.getValueSizeInBits() == VT.getSizeInBits()) {
56642 unsigned NumElts = VT.getVectorNumElements();
56643 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
56644 EVT EltVT = In.getOperand(0).getValueType();
56645 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
56646 for (unsigned I = 0; I != NumElts; ++I)
56647 Elts[I * Scale] = In.getOperand(I);
56648 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
56649 }
56650
56651 // Attempt to combine as a shuffle on SSE41+ targets.
56652 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56653 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
56654 Subtarget.hasSSE41()) {
56655 SDValue Op(N, 0);
56656 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
56657 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56658 return Res;
56659 }
56660
56661 return SDValue();
56662}
56663
56664static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
56665 TargetLowering::DAGCombinerInfo &DCI) {
56666 EVT VT = N->getValueType(0);
56667
56668 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
56669 return DAG.getConstant(0, SDLoc(N), VT);
56670
56671 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56672 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56673 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56674 return SDValue(N, 0);
56675
56676 return SDValue();
56677}
56678
56679// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
56680 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
56681 // extra instructions between the conversions due to going to scalar and back.
56682static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
56683 const X86Subtarget &Subtarget) {
56684 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
56685 return SDValue();
56686
56687 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
56688 return SDValue();
56689
56690 if (N->getValueType(0) != MVT::f32 ||
56691 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
56692 return SDValue();
56693
56694 SDLoc dl(N);
56695 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
56696 N->getOperand(0).getOperand(0));
56697 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
56698 DAG.getTargetConstant(4, dl, MVT::i32));
56699 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
56700 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
56701 DAG.getIntPtrConstant(0, dl));
56702}
56703
56704static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
56705 const X86Subtarget &Subtarget) {
56706 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56707 return SDValue();
56708
56709 if (Subtarget.hasFP16())
56710 return SDValue();
56711
56712 bool IsStrict = N->isStrictFPOpcode();
56713 EVT VT = N->getValueType(0);
56714 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56715 EVT SrcVT = Src.getValueType();
56716
56717 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
56718 return SDValue();
56719
56720 if (VT.getVectorElementType() != MVT::f32 &&
56721 VT.getVectorElementType() != MVT::f64)
56722 return SDValue();
56723
56724 unsigned NumElts = VT.getVectorNumElements();
56725 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56726 return SDValue();
56727
56728 SDLoc dl(N);
56729
56730 // Convert the input to vXi16.
56731 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
56732 Src = DAG.getBitcast(IntVT, Src);
56733
56734 // Widen to at least 8 input elements.
56735 if (NumElts < 8) {
56736 unsigned NumConcats = 8 / NumElts;
56737 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
56738 : DAG.getConstant(0, dl, IntVT);
56739 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
56740 Ops[0] = Src;
56741 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
56742 }
56743
56744 // Destination is vXf32 with at least 4 elements.
56745 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
56746 std::max(4U, NumElts));
56747 SDValue Cvt, Chain;
56748 if (IsStrict) {
56749 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
56750 {N->getOperand(0), Src});
56751 Chain = Cvt.getValue(1);
56752 } else {
56753 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
56754 }
56755
56756 if (NumElts < 4) {
56757 assert(NumElts == 2 && "Unexpected size");
56758 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
56759 DAG.getIntPtrConstant(0, dl));
56760 }
56761
56762 if (IsStrict) {
56763 // Extend to the original VT if necessary.
56764 if (Cvt.getValueType() != VT) {
56765 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
56766 {Chain, Cvt});
56767 Chain = Cvt.getValue(1);
56768 }
56769 return DAG.getMergeValues({Cvt, Chain}, dl);
56770 }
56771
56772 // Extend to the original VT if necessary.
56773 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
56774}
56775
56776// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
56777// from. Limit this to cases where the loads have the same input chain and the
56778// output chains are unused. This avoids any memory ordering issues.
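// For example (an illustrative sketch with assumed types):
//   v4f32 (X86ISD::VBROADCAST_LOAD<f32> p) and v8f32 (X86ISD::VBROADCAST_LOAD<f32> p)
//     --> reuse the wider node: extract_subvector (v8f32 broadcast), 0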
56779static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
56780 TargetLowering::DAGCombinerInfo &DCI) {
56781 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
56782         N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
56783        "Unknown broadcast load type");
56784
56785 // Only do this if the chain result is unused.
56786 if (N->hasAnyUseOfValue(1))
56787 return SDValue();
56788
56789 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
56790
56791 SDValue Ptr = MemIntrin->getBasePtr();
56792 SDValue Chain = MemIntrin->getChain();
56793 EVT VT = N->getSimpleValueType(0);
56794 EVT MemVT = MemIntrin->getMemoryVT();
56795
56796 // Look at other users of our base pointer and try to find a wider broadcast.
56797 // The input chain and the size of the memory VT must match.
56798 for (SDNode *User : Ptr->uses())
56799 if (User != N && User->getOpcode() == N->getOpcode() &&
56800 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
56801 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
56802 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
56803 MemVT.getSizeInBits() &&
56804 !User->hasAnyUseOfValue(1) &&
56805 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
56806 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
56807 VT.getSizeInBits());
56808 Extract = DAG.getBitcast(VT, Extract);
56809 return DCI.CombineTo(N, Extract, SDValue(User, 1));
56810 }
56811
56812 return SDValue();
56813}
56814
56815static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
56816 const X86Subtarget &Subtarget) {
56817 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56818 return SDValue();
56819
56820 bool IsStrict = N->isStrictFPOpcode();
56821 EVT VT = N->getValueType(0);
56822 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56823 EVT SrcVT = Src.getValueType();
56824
56825 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
56826 SrcVT.getVectorElementType() != MVT::f32)
56827 return SDValue();
56828
56829 SDLoc dl(N);
56830
56831 SDValue Cvt, Chain;
56832 unsigned NumElts = VT.getVectorNumElements();
56833 if (Subtarget.hasFP16()) {
56834 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
56835 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
56836 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
56837 SDValue Cvt0, Cvt1;
56838 SDValue Op0 = Src.getOperand(0);
56839 SDValue Op1 = Src.getOperand(1);
56840 bool IsOp0Strict = Op0->isStrictFPOpcode();
56841 if (Op0.getOpcode() != Op1.getOpcode() ||
56842 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
56843 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
56844 return SDValue();
56845 }
56846 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
56847 if (IsStrict) {
56848 assert(IsOp0Strict && "Op0 must be strict node");
56849 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
56850 ? X86ISD::STRICT_CVTSI2P
56851 : X86ISD::STRICT_CVTUI2P;
56852 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56853 {Op0.getOperand(0), Op0.getOperand(1)});
56854 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56855 {Op1.getOperand(0), Op1.getOperand(1)});
56856 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56857 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
56858 }
56859 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
56860 : X86ISD::CVTUI2P;
56861 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
56862 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
56863 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56864 }
56865 return SDValue();
56866 }
56867
56868 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56869 return SDValue();
56870
56871 // Widen to at least 4 input elements.
56872 if (NumElts < 4)
56873 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
56874 DAG.getConstantFP(0.0, dl, SrcVT));
56875
56876 // Destination is vXi16 with at least 8 elements.
56877 EVT CvtVT =
56878 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
56879 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
56880 if (IsStrict) {
56881 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
56882 {N->getOperand(0), Src, Rnd});
56883 Chain = Cvt.getValue(1);
56884 } else {
56885 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
56886 }
56887
56888 // Extract down to real number of elements.
56889 if (NumElts < 8) {
56890 EVT IntVT = VT.changeVectorElementTypeToInteger();
56891 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
56892 DAG.getIntPtrConstant(0, dl));
56893 }
56894
56895 Cvt = DAG.getBitcast(VT, Cvt);
56896
56897 if (IsStrict)
56898 return DAG.getMergeValues({Cvt, Chain}, dl);
56899
56900 return Cvt;
56901}
56902
56903static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
56904 SDValue Src = N->getOperand(0);
56905
56906 // Turn MOVDQ2Q+simple_load into an mmx load.
56907 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
56908 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
56909
56910 if (LN->isSimple()) {
56911 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
56912 LN->getBasePtr(),
56913 LN->getPointerInfo(),
56914 LN->getOriginalAlign(),
56915 LN->getMemOperand()->getFlags());
56916 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
56917 return NewLd;
56918 }
56919 }
56920
56921 return SDValue();
56922}
56923
56924static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
56925 TargetLowering::DAGCombinerInfo &DCI) {
56926 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
56927 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56928 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
56929 return SDValue(N, 0);
56930
56931 return SDValue();
56932}
56933
56934SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
56935 DAGCombinerInfo &DCI) const {
56936 SelectionDAG &DAG = DCI.DAG;
56937 switch (N->getOpcode()) {
56938 default: break;
56939 case ISD::SCALAR_TO_VECTOR:
56940 return combineScalarToVector(N, DAG);
56941 case ISD::EXTRACT_VECTOR_ELT:
56942 case X86ISD::PEXTRW:
56943 case X86ISD::PEXTRB:
56944 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
56945 case ISD::CONCAT_VECTORS:
56946 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
56947 case ISD::INSERT_SUBVECTOR:
56948 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
56949 case ISD::EXTRACT_SUBVECTOR:
56950 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
56951 case ISD::VSELECT:
56952 case ISD::SELECT:
56953 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
56954 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
56955 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
56956 case X86ISD::CMP: return combineCMP(N, DAG);
56957 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
56958 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
56959 case X86ISD::ADD:
56960 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
56961 case X86ISD::SBB: return combineSBB(N, DAG);
56962 case X86ISD::ADC: return combineADC(N, DAG, DCI);
56963 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
56964 case ISD::SHL: return combineShiftLeft(N, DAG);
56965 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
56966 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
56967 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
56968 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
56969 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
56970 case X86ISD::BEXTR:
56971 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
56972 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
56973 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
56974 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
56975 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
56976 case X86ISD::VEXTRACT_STORE:
56977 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
56978 case ISD::SINT_TO_FP:
56979 case ISD::STRICT_SINT_TO_FP:
56980 return combineSIntToFP(N, DAG, DCI, Subtarget);
56981 case ISD::UINT_TO_FP:
56982 case ISD::STRICT_UINT_TO_FP:
56983 return combineUIntToFP(N, DAG, Subtarget);
56984 case ISD::FADD:
56985 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
56986 case X86ISD::VFCMULC:
56987 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
56988 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
56989 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
56990 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
56991 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
56992 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
56993 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
56994 case X86ISD::FXOR:
56995 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
56996 case X86ISD::FMIN:
56997 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
56998 case ISD::FMINNUM:
56999 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57000 case X86ISD::CVTSI2P:
57001 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57002 case X86ISD::CVTP2SI:
57003 case X86ISD::CVTP2UI:
57004 case X86ISD::STRICT_CVTTP2SI:
57005 case X86ISD::CVTTP2SI:
57006 case X86ISD::STRICT_CVTTP2UI:
57007 case X86ISD::CVTTP2UI:
57008 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57009 case X86ISD::STRICT_CVTPH2PS:
57010 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57011 case X86ISD::BT: return combineBT(N, DAG, DCI);
57012 case ISD::ANY_EXTEND:
57013 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57014 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57015 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57016 case ISD::ANY_EXTEND_VECTOR_INREG:
57017 case ISD::SIGN_EXTEND_VECTOR_INREG:
57018 case ISD::ZERO_EXTEND_VECTOR_INREG:
57019 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57020 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57021 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57022 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57023 case X86ISD::PACKSS:
57024 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57025 case X86ISD::HADD:
57026 case X86ISD::HSUB:
57027 case X86ISD::FHADD:
57028 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57029 case X86ISD::VSHL:
57030 case X86ISD::VSRA:
57031 case X86ISD::VSRL:
57032 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57033 case X86ISD::VSHLI:
57034 case X86ISD::VSRAI:
57035 case X86ISD::VSRLI:
57036 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57037 case ISD::INSERT_VECTOR_ELT:
57038 case X86ISD::PINSRB:
57039 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57040 case X86ISD::SHUFP: // Handle all target specific shuffles
57041 case X86ISD::INSERTPS:
57042 case X86ISD::EXTRQI:
57043 case X86ISD::INSERTQI:
57044 case X86ISD::VALIGN:
57045 case X86ISD::PALIGNR:
57046 case X86ISD::VSHLDQ:
57047 case X86ISD::VSRLDQ:
57048 case X86ISD::BLENDI:
57049 case X86ISD::UNPCKH:
57050 case X86ISD::UNPCKL:
57051 case X86ISD::MOVHLPS:
57052 case X86ISD::MOVLHPS:
57053 case X86ISD::PSHUFB:
57054 case X86ISD::PSHUFD:
57055 case X86ISD::PSHUFHW:
57056 case X86ISD::PSHUFLW:
57057 case X86ISD::MOVSHDUP:
57058 case X86ISD::MOVSLDUP:
57059 case X86ISD::MOVDDUP:
57060 case X86ISD::MOVSS:
57061 case X86ISD::MOVSD:
57062 case X86ISD::MOVSH:
57063 case X86ISD::VBROADCAST:
57064 case X86ISD::VPPERM:
57065 case X86ISD::VPERMI:
57066 case X86ISD::VPERMV:
57067 case X86ISD::VPERMV3:
57068 case X86ISD::VPERMIL2:
57069 case X86ISD::VPERMILPI:
57070 case X86ISD::VPERMILPV:
57071 case X86ISD::VPERM2X128:
57072 case X86ISD::SHUF128:
57073 case X86ISD::VZEXT_MOVL:
57074 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
57075 case X86ISD::FMADD_RND:
57076 case X86ISD::FMSUB:
57077 case X86ISD::STRICT_FMSUB:
57078 case X86ISD::FMSUB_RND:
57079 case X86ISD::FNMADD:
57080 case X86ISD::STRICT_FNMADD:
57081 case X86ISD::FNMADD_RND:
57082 case X86ISD::FNMSUB:
57083 case X86ISD::STRICT_FNMSUB:
57084 case X86ISD::FNMSUB_RND:
57085 case ISD::FMA:
57086 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57087 case X86ISD::FMADDSUB_RND:
57088 case X86ISD::FMSUBADD_RND:
57089 case X86ISD::FMADDSUB:
57090 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57091 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57092 case X86ISD::MGATHER:
57093 case X86ISD::MSCATTER:
57094 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57095 case ISD::MGATHER:
57096 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57097 case X86ISD::PCMPEQ:
57098 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57099 case X86ISD::PMULDQ:
57100 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57101 case X86ISD::VPMADDUBSW:
57102 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57103 case X86ISD::KSHIFTL:
57104 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57105 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57106 case ISD::STRICT_FP_EXTEND:
57107 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57108 case ISD::STRICT_FP_ROUND:
57109 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57110 case X86ISD::VBROADCAST_LOAD:
57111 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57112 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57113 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57114 }
57115
57116 return SDValue();
57117}
57118
57119bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57120 return false;
57121}
57122
57123bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57124 if (!isTypeLegal(VT))
57125 return false;
57126
57127 // There are no vXi8 shifts.
57128 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57129 return false;
57130
57131 // TODO: Almost no 8-bit ops are desirable because they have no actual
57132 // size/speed advantages vs. 32-bit ops, but they do have a major
57133 // potential disadvantage by causing partial register stalls.
57134 //
57135 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57136 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57137 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57138 // check for a constant operand to the multiply.
57139 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57140 return false;
57141
57142 // i16 instruction encodings are longer and some i16 instructions are slow,
57143 // so those are not desirable.
57144 if (VT == MVT::i16) {
57145 switch (Opc) {
57146 default:
57147 break;
57148 case ISD::LOAD:
57149 case ISD::SIGN_EXTEND:
57150 case ISD::ZERO_EXTEND:
57151 case ISD::ANY_EXTEND:
57152 case ISD::SHL:
57153 case ISD::SRA:
57154 case ISD::SRL:
57155 case ISD::SUB:
57156 case ISD::ADD:
57157 case ISD::MUL:
57158 case ISD::AND:
57159 case ISD::OR:
57160 case ISD::XOR:
57161 return false;
57162 }
57163 }
57164
57165 // Any legal type not explicitly accounted for above here is desirable.
57166 return true;
57167}
57168
57169SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57170 SDValue Value, SDValue Addr,
57171 SelectionDAG &DAG) const {
57172 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57173 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57174 if (IsCFProtectionSupported) {
57175 // If control-flow branch protection is enabled, we need to add a
57176 // notrack prefix to the indirect branch.
57177 // In order to do that we create an NT_BRIND SDNode.
57178 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
57179 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57180 }
57181
57182 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57183}
57184
57185TargetLowering::AndOrSETCCFoldKind
57186X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57187 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57188 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57189 EVT VT = LogicOp->getValueType(0);
57190 EVT OpVT = SETCC0->getOperand(0).getValueType();
57191 if (!VT.isInteger())
57192 return AndOrSETCCFoldKind::None;
57193 if (VT.isVector())
57194 return isOperationLegal(ISD::ABS, OpVT) ? AndOrSETCCFoldKind::ABS
57195 : AndOrSETCCFoldKind::None;
57196 return AndOrSETCCFoldKind::AddAnd;
57197}
57198
57199bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57200 EVT VT = Op.getValueType();
57201 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57202 isa<ConstantSDNode>(Op.getOperand(1));
57203
57204 // i16 is legal, but undesirable since i16 instruction encodings are longer
57205 // and some i16 instructions are slow.
57206 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57207 // using LEA and/or other ALU ops.
57208 if (VT != MVT::i16 && !Is8BitMulByConstant)
57209 return false;
57210
57211 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57212 if (!Op.hasOneUse())
57213 return false;
57214 SDNode *User = *Op->use_begin();
57215 if (!ISD::isNormalStore(User))
57216 return false;
57217 auto *Ld = cast<LoadSDNode>(Load);
57218 auto *St = cast<StoreSDNode>(User);
57219 return Ld->getBasePtr() == St->getBasePtr();
57220 };
57221
57222 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57223 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57224 return false;
57225 if (!Op.hasOneUse())
57226 return false;
57227 SDNode *User = *Op->use_begin();
57228 if (User->getOpcode() != ISD::ATOMIC_STORE)
57229 return false;
57230 auto *Ld = cast<AtomicSDNode>(Load);
57231 auto *St = cast<AtomicSDNode>(User);
57232 return Ld->getBasePtr() == St->getBasePtr();
57233 };
57234
57235 bool Commute = false;
57236 switch (Op.getOpcode()) {
57237 default: return false;
57238 case ISD::SIGN_EXTEND:
57239 case ISD::ZERO_EXTEND:
57240 case ISD::ANY_EXTEND:
57241 break;
57242 case ISD::SHL:
57243 case ISD::SRA:
57244 case ISD::SRL: {
57245 SDValue N0 = Op.getOperand(0);
57246 // Look out for (store (shl (load), x)).
57247 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57248 return false;
57249 break;
57250 }
57251 case ISD::ADD:
57252 case ISD::MUL:
57253 case ISD::AND:
57254 case ISD::OR:
57255 case ISD::XOR:
57256 Commute = true;
57257 [[fallthrough]];
57258 case ISD::SUB: {
57259 SDValue N0 = Op.getOperand(0);
57260 SDValue N1 = Op.getOperand(1);
57261 // Avoid disabling potential load folding opportunities.
57262 if (X86::mayFoldLoad(N1, Subtarget) &&
57263 (!Commute || !isa<ConstantSDNode>(N0) ||
57264 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57265 return false;
57266 if (X86::mayFoldLoad(N0, Subtarget) &&
57267 ((Commute && !isa<ConstantSDNode>(N1)) ||
57268 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57269 return false;
57270 if (IsFoldableAtomicRMW(N0, Op) ||
57271 (Commute && IsFoldableAtomicRMW(N1, Op)))
57272 return false;
57273 }
57274 }
57275
57276 PVT = MVT::i32;
57277 return true;
57278}
57279
57280//===----------------------------------------------------------------------===//
57281// X86 Inline Assembly Support
57282//===----------------------------------------------------------------------===//
57283
57284 // Helper to match a string as a sequence of pieces separated by whitespace.
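// For example (an illustrative sketch of how the helper behaves):
//   matchAsm("  bswap   $0", {"bswap", "$0"}) == true
//   matchAsm("bswapl $0",    {"bswap", "$0"}) == false  (only a prefix of the
//                                                        first token matched)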
57285static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57286 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57287
57288 for (StringRef Piece : Pieces) {
57289 if (!S.startswith(Piece)) // Check if the piece matches.
57290 return false;
57291
57292 S = S.substr(Piece.size());
57293 StringRef::size_type Pos = S.find_first_not_of(" \t");
57294 if (Pos == 0) // We matched a prefix.
57295 return false;
57296
57297 S = S.substr(Pos);
57298 }
57299
57300 return S.empty();
57301}
57302
57303static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57304
57305 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57306 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57307 llvm::is_contained(AsmPieces, "~{flags}") &&
57308 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57309
57310 if (AsmPieces.size() == 3)
57311 return true;
57312 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57313 return true;
57314 }
57315 }
57316 return false;
57317}
57318
57319bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57320 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57321
57322 const std::string &AsmStr = IA->getAsmString();
57323
57324 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57325 if (!Ty || Ty->getBitWidth() % 16 != 0)
57326 return false;
57327
57328 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57329 SmallVector<StringRef, 4> AsmPieces;
57330 SplitString(AsmStr, AsmPieces, ";\n");
57331
57332 switch (AsmPieces.size()) {
57333 default: return false;
57334 case 1:
57335 // FIXME: this should verify that we are targeting a 486 or better. If not,
57336 // we will turn this bswap into something that will be lowered to logical
57337 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57338 // lower so don't worry about this.
57339 // bswap $0
57340 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57341 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57342 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57343 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57344 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57345 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57346 // No need to check constraints; nothing other than the equivalent of
57347 // "=r,0" would be valid here.
57348 return IntrinsicLowering::LowerToByteSwap(CI);
57349 }
57350
57351 // rorw $$8, ${0:w} --> llvm.bswap.i16
57352 if (CI->getType()->isIntegerTy(16) &&
57353 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57354 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57355 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57356 AsmPieces.clear();
57357 StringRef ConstraintsStr = IA->getConstraintString();
57358 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57359 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57360 if (clobbersFlagRegisters(AsmPieces))
57361 return IntrinsicLowering::LowerToByteSwap(CI);
57362 }
57363 break;
57364 case 3:
57365 if (CI->getType()->isIntegerTy(32) &&
57366 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57367 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57368 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57369 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57370 AsmPieces.clear();
57371 StringRef ConstraintsStr = IA->getConstraintString();
57372 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57373 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57374 if (clobbersFlagRegisters(AsmPieces))
57375 return IntrinsicLowering::LowerToByteSwap(CI);
57376 }
57377
57378 if (CI->getType()->isIntegerTy(64)) {
57379 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57380 if (Constraints.size() >= 2 &&
57381 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57382 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57383 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57384 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57385 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57386 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57387 return IntrinsicLowering::LowerToByteSwap(CI);
57388 }
57389 }
57390 break;
57391 }
57392 return false;
57393}
57394
57395static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57396 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57397 .Case("{@cca}", X86::COND_A)
57398 .Case("{@ccae}", X86::COND_AE)
57399 .Case("{@ccb}", X86::COND_B)
57400 .Case("{@ccbe}", X86::COND_BE)
57401 .Case("{@ccc}", X86::COND_B)
57402 .Case("{@cce}", X86::COND_E)
57403 .Case("{@ccz}", X86::COND_E)
57404 .Case("{@ccg}", X86::COND_G)
57405 .Case("{@ccge}", X86::COND_GE)
57406 .Case("{@ccl}", X86::COND_L)
57407 .Case("{@ccle}", X86::COND_LE)
57408 .Case("{@ccna}", X86::COND_BE)
57409 .Case("{@ccnae}", X86::COND_B)
57410 .Case("{@ccnb}", X86::COND_AE)
57411 .Case("{@ccnbe}", X86::COND_A)
57412 .Case("{@ccnc}", X86::COND_AE)
57413 .Case("{@ccne}", X86::COND_NE)
57414 .Case("{@ccnz}", X86::COND_NE)
57415 .Case("{@ccng}", X86::COND_LE)
57416 .Case("{@ccnge}", X86::COND_L)
57417 .Case("{@ccnl}", X86::COND_GE)
57418 .Case("{@ccnle}", X86::COND_G)
57419 .Case("{@ccno}", X86::COND_NO)
57420 .Case("{@ccnp}", X86::COND_NP)
57421 .Case("{@ccns}", X86::COND_NS)
57422 .Case("{@cco}", X86::COND_O)
57423 .Case("{@ccp}", X86::COND_P)
57424 .Case("{@ccs}", X86::COND_S)
57425 .Default(X86::COND_INVALID);
57426 return Cond;
57427}
57428
57429/// Given a constraint letter, return the type of constraint for this target.
57430X86TargetLowering::ConstraintType
57431X86TargetLowering::getConstraintType(StringRef Constraint) const {
57432 if (Constraint.size() == 1) {
57433 switch (Constraint[0]) {
57434 case 'R':
57435 case 'q':
57436 case 'Q':
57437 case 'f':
57438 case 't':
57439 case 'u':
57440 case 'y':
57441 case 'x':
57442 case 'v':
57443 case 'l':
57444 case 'k': // AVX512 masking registers.
57445 return C_RegisterClass;
57446 case 'a':
57447 case 'b':
57448 case 'c':
57449 case 'd':
57450 case 'S':
57451 case 'D':
57452 case 'A':
57453 return C_Register;
57454 case 'I':
57455 case 'J':
57456 case 'K':
57457 case 'N':
57458 case 'G':
57459 case 'L':
57460 case 'M':
57461 return C_Immediate;
57462 case 'C':
57463 case 'e':
57464 case 'Z':
57465 return C_Other;
57466 default:
57467 break;
57468 }
57469 }
57470 else if (Constraint.size() == 2) {
57471 switch (Constraint[0]) {
57472 default:
57473 break;
57474 case 'Y':
57475 switch (Constraint[1]) {
57476 default:
57477 break;
57478 case 'z':
57479 return C_Register;
57480 case 'i':
57481 case 'm':
57482 case 'k':
57483 case 't':
57484 case '2':
57485 return C_RegisterClass;
57486 }
57487 }
57488 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57489 return C_Other;
57490 return TargetLowering::getConstraintType(Constraint);
57491}
57492
57493/// Examine constraint type and operand type and determine a weight value.
57494/// This object must already have been set up with the operand type
57495/// and the current alternative constraint selected.
57496TargetLowering::ConstraintWeight
57497 X86TargetLowering::getSingleConstraintMatchWeight(
57498 AsmOperandInfo &info, const char *constraint) const {
57499 ConstraintWeight weight = CW_Invalid;
57500 Value *CallOperandVal = info.CallOperandVal;
57501 // If we don't have a value, we can't do a match,
57502 // but allow it at the lowest weight.
57503 if (!CallOperandVal)
57504 return CW_Default;
57505 Type *type = CallOperandVal->getType();
57506 // Look at the constraint type.
57507 switch (*constraint) {
57508 default:
57509 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
57510 [[fallthrough]];
57511 case 'R':
57512 case 'q':
57513 case 'Q':
57514 case 'a':
57515 case 'b':
57516 case 'c':
57517 case 'd':
57518 case 'S':
57519 case 'D':
57520 case 'A':
57521 if (CallOperandVal->getType()->isIntegerTy())
57522 weight = CW_SpecificReg;
57523 break;
57524 case 'f':
57525 case 't':
57526 case 'u':
57527 if (type->isFloatingPointTy())
57528 weight = CW_SpecificReg;
57529 break;
57530 case 'y':
57531 if (type->isX86_MMXTy() && Subtarget.hasMMX())
57532 weight = CW_SpecificReg;
57533 break;
57534 case 'Y':
57535 if (StringRef(constraint).size() != 2)
57536 break;
57537 switch (constraint[1]) {
57538 default:
57539 return CW_Invalid;
57540 // XMM0
57541 case 'z':
57542 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57543 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
57544 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
57545 return CW_SpecificReg;
57546 return CW_Invalid;
57547 // Conditional OpMask regs (AVX512)
57548 case 'k':
57549 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57550 return CW_Register;
57551 return CW_Invalid;
57552 // Any MMX reg
57553 case 'm':
57554 if (type->isX86_MMXTy() && Subtarget.hasMMX())
57555 return weight;
57556 return CW_Invalid;
57557 // Any SSE reg when ISA >= SSE2, same as 'x'
57558 case 'i':
57559 case 't':
57560 case '2':
57561 if (!Subtarget.hasSSE2())
57562 return CW_Invalid;
57563 break;
57564 }
57565 break;
57566 case 'v':
57567 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
57568 weight = CW_Register;
57569 [[fallthrough]];
57570 case 'x':
57571 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57572 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
57573 weight = CW_Register;
57574 break;
57575 case 'k':
57576 // Enable conditional vector operations using %k<#> registers.
57577 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57578 weight = CW_Register;
57579 break;
57580 case 'I':
57581 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
57582 if (C->getZExtValue() <= 31)
57583 weight = CW_Constant;
57584 }
57585 break;
57586 case 'J':
57587 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57588 if (C->getZExtValue() <= 63)
57589 weight = CW_Constant;
57590 }
57591 break;
57592 case 'K':
57593 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57594 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
57595 weight = CW_Constant;
57596 }
57597 break;
57598 case 'L':
57599 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57600 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
57601 weight = CW_Constant;
57602 }
57603 break;
57604 case 'M':
57605 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57606 if (C->getZExtValue() <= 3)
57607 weight = CW_Constant;
57608 }
57609 break;
57610 case 'N':
57611 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57612 if (C->getZExtValue() <= 0xff)
57613 weight = CW_Constant;
57614 }
57615 break;
57616 case 'G':
57617 case 'C':
57618 if (isa<ConstantFP>(CallOperandVal)) {
57619 weight = CW_Constant;
57620 }
57621 break;
57622 case 'e':
57623 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57624 if ((C->getSExtValue() >= -0x80000000LL) &&
57625 (C->getSExtValue() <= 0x7fffffffLL))
57626 weight = CW_Constant;
57627 }
57628 break;
57629 case 'Z':
57630 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57631 if (C->getZExtValue() <= 0xffffffff)
57632 weight = CW_Constant;
57633 }
57634 break;
57635 }
57636 return weight;
57637}
57638
57639/// Try to replace an X constraint, which matches anything, with another that
57640/// has more specific requirements based on the type of the corresponding
57641/// operand.
57642const char *X86TargetLowering::
57643LowerXConstraint(EVT ConstraintVT) const {
57644 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
57645 // 'f' like normal targets.
57646 if (ConstraintVT.isFloatingPoint()) {
57647 if (Subtarget.hasSSE1())
57648 return "x";
57649 }
57650
57651 return TargetLowering::LowerXConstraint(ConstraintVT);
57652}
57653
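For readers skimming the constraint-lowering hooks, here is a minimal, hedged sketch (illustration only, not part of the analyzed file) of what the "X" lowering above affects: with SSE1 available, a floating-point operand under the catch-all "X" constraint is treated as "x" and therefore lands in an XMM register; without SSE it falls back to the generic handling.

#include <cstdio>

int main() {
  float f = 1.5f;
  // "X" matches anything; for an FP operand on an SSE-capable subtarget the
  // hook above narrows it to "x", so the operand is allocated to an XMM reg.
  asm volatile("" :: "X"(f));
  std::printf("%f\n", f);
  return 0;
}
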
57654// Lower @cc targets via setcc.
57655SDValue X86TargetLowering::LowerAsmOutputForConstraint(
57656 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
57657 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
57658 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
57659 if (Cond == X86::COND_INVALID)
57660 return SDValue();
57661 // Check that return type is valid.
57662 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
57663 OpInfo.ConstraintVT.getSizeInBits() < 8)
57664 report_fatal_error("Flag output operand is of invalid type");
57665
57666 // Get EFLAGS register. Only update chain when copyfrom is glued.
57667 if (Flag.getNode()) {
57668 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
57669 Chain = Flag.getValue(1);
57670 } else
57671 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
57672 // Extract CC code.
57673 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
57674 // Zero-extend the CC result to the constraint's integer type.
57675 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
57676
57677 return Result;
57678}
57679
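As a hedged illustration of the "@cc" flag-output constraints this hook serves (example code, not taken from the analyzed file), the EFLAGS copy, SETcc, and zero-extend built above are what turn a condition-code output such as "=@ccz" into an ordinary integer value:

#include <cstdio>

// Returns 1 when a == b by reading ZF through a flag-output constraint.
// "@ccz" parses to X86::COND_E; the lowering above then copies EFLAGS,
// emits SETE, and zero-extends the result to the output's integer type.
static int equal(unsigned a, unsigned b) {
  int eq;
  asm("cmp %2, %1" : "=@ccz"(eq) : "r"(a), "r"(b));
  return eq;
}

int main() {
  std::printf("%d %d\n", equal(3, 3), equal(3, 4));  // prints "1 0"
  return 0;
}
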
57680/// Lower the specified operand into the Ops vector.
57681/// If it is invalid, don't add anything to Ops.
57682void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
57683 std::string &Constraint,
57684 std::vector<SDValue>&Ops,
57685 SelectionDAG &DAG) const {
57686 SDValue Result;
57687
57688 // Only support length 1 constraints for now.
57689 if (Constraint.length() > 1) return;
57690
57691 char ConstraintLetter = Constraint[0];
57692 switch (ConstraintLetter) {
57693 default: break;
57694 case 'I':
57695 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57696 if (C->getZExtValue() <= 31) {
57697 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57698 Op.getValueType());
57699 break;
57700 }
57701 }
57702 return;
57703 case 'J':
57704 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57705 if (C->getZExtValue() <= 63) {
57706 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57707 Op.getValueType());
57708 break;
57709 }
57710 }
57711 return;
57712 case 'K':
57713 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57714 if (isInt<8>(C->getSExtValue())) {
57715 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57716 Op.getValueType());
57717 break;
57718 }
57719 }
57720 return;
57721 case 'L':
57722 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57723 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
57724 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
57725 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
57726 Op.getValueType());
57727 break;
57728 }
57729 }
57730 return;
57731 case 'M':
57732 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57733 if (C->getZExtValue() <= 3) {
57734 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57735 Op.getValueType());
57736 break;
57737 }
57738 }
57739 return;
57740 case 'N':
57741 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57742 if (C->getZExtValue() <= 255) {
57743 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57744 Op.getValueType());
57745 break;
57746 }
57747 }
57748 return;
57749 case 'O':
57750 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57751 if (C->getZExtValue() <= 127) {
57752 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57753 Op.getValueType());
57754 break;
57755 }
57756 }
57757 return;
57758 case 'e': {
57759 // 32-bit signed value
57760 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57761 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57762 C->getSExtValue())) {
57763 // Widen to 64 bits here to get it sign extended.
57764 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
57765 break;
57766 }
57767 // FIXME gcc accepts some relocatable values here too, but only in certain
57768 // memory models; it's complicated.
57769 }
57770 return;
57771 }
57772 case 'Z': {
57773 // 32-bit unsigned value
57774 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57775 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57776 C->getZExtValue())) {
57777 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57778 Op.getValueType());
57779 break;
57780 }
57781 }
57782 // FIXME gcc accepts some relocatable values here too, but only in certain
57783 // memory models; it's complicated.
57784 return;
57785 }
57786 case 'i': {
57787 // Literal immediates are always ok.
57788 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
57789 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
57790 BooleanContent BCont = getBooleanContents(MVT::i64);
57791 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
57792 : ISD::SIGN_EXTEND;
57793 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
57794 : CST->getSExtValue();
57795 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
57796 break;
57797 }
57798
57799 // In any sort of PIC mode addresses need to be computed at runtime by
57800 // adding in a register or some sort of table lookup. These can't
57801 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
57802 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
57803 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
57804 return;
57805
57806 // If we are in non-pic codegen mode, we allow the address of a global (with
57807 // an optional displacement) to be used with 'i'.
57808 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57809 // If we require an extra load to get this address, as in PIC mode, we
57810 // can't accept it.
57811 if (isGlobalStubReference(
57812 Subtarget.classifyGlobalReference(GA->getGlobal())))
57813 return;
57814 break;
57815 }
57816 }
57817
57818 if (Result.getNode()) {
57819 Ops.push_back(Result);
57820 return;
57821 }
57822 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
57823}
57824
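For the single-letter immediate constraints validated above, a small illustration (not part of the analyzed file): 'I' accepts an immediate in [0, 31], 'J' in [0, 63], 'K' a signed 8-bit value, 'N' in [0, 255], and so on; an out-of-range literal is rejected rather than silently truncated.

#include <cstdio>

int main() {
  unsigned x = 1;
  // 'I' restricts the operand to an immediate in [0, 31]; the 'I' case above
  // turns the literal 5 into a target constant. A value such as 40 would be
  // rejected and the constraint would fail to match.
  asm("shll %1, %0" : "+r"(x) : "I"(5));
  std::printf("%u\n", x);  // prints 32
  return 0;
}
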
57825/// Check if \p RC is a general purpose register class.
57826 /// I.e., GR* or one of their variants.
57827static bool isGRClass(const TargetRegisterClass &RC) {
57828 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
57829 RC.hasSuperClassEq(&X86::GR16RegClass) ||
57830 RC.hasSuperClassEq(&X86::GR32RegClass) ||
57831 RC.hasSuperClassEq(&X86::GR64RegClass) ||
57832 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
57833}
57834
57835/// Check if \p RC is a vector register class.
57836 /// I.e., FR* / VR* or one of their variants.
57837static bool isFRClass(const TargetRegisterClass &RC) {
57838 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
57839 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
57840 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
57841 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
57842 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
57843 RC.hasSuperClassEq(&X86::VR512RegClass);
57844}
57845
57846/// Check if \p RC is a mask register class.
57847 /// I.e., VK* or one of their variants.
57848static bool isVKClass(const TargetRegisterClass &RC) {
57849 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
57850 RC.hasSuperClassEq(&X86::VK2RegClass) ||
57851 RC.hasSuperClassEq(&X86::VK4RegClass) ||
57852 RC.hasSuperClassEq(&X86::VK8RegClass) ||
57853 RC.hasSuperClassEq(&X86::VK16RegClass) ||
57854 RC.hasSuperClassEq(&X86::VK32RegClass) ||
57855 RC.hasSuperClassEq(&X86::VK64RegClass);
57856}
57857
57858std::pair<unsigned, const TargetRegisterClass *>
57859X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
57860 StringRef Constraint,
57861 MVT VT) const {
57862 // First, see if this is a constraint that directly corresponds to an LLVM
57863 // register class.
57864 if (Constraint.size() == 1) {
57865 // GCC Constraint Letters
57866 switch (Constraint[0]) {
57867 default: break;
57868 // 'A' means [ER]AX + [ER]DX.
57869 case 'A':
57870 if (Subtarget.is64Bit())
57871 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
57872 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
57873 "Expecting 64, 32 or 16 bit subtarget");
57874 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57875
57876 // TODO: Slight differences here in allocation order and leaving
57877 // RIP in the class. Do they matter any more here than they do
57878 // in the normal allocation?
57879 case 'k':
57880 if (Subtarget.hasAVX512()) {
57881 if (VT == MVT::i1)
57882 return std::make_pair(0U, &X86::VK1RegClass);
57883 if (VT == MVT::i8)
57884 return std::make_pair(0U, &X86::VK8RegClass);
57885 if (VT == MVT::i16)
57886 return std::make_pair(0U, &X86::VK16RegClass);
57887 }
57888 if (Subtarget.hasBWI()) {
57889 if (VT == MVT::i32)
57890 return std::make_pair(0U, &X86::VK32RegClass);
57891 if (VT == MVT::i64)
57892 return std::make_pair(0U, &X86::VK64RegClass);
57893 }
57894 break;
57895 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
57896 if (Subtarget.is64Bit()) {
57897 if (VT == MVT::i8 || VT == MVT::i1)
57898 return std::make_pair(0U, &X86::GR8RegClass);
57899 if (VT == MVT::i16)
57900 return std::make_pair(0U, &X86::GR16RegClass);
57901 if (VT == MVT::i32 || VT == MVT::f32)
57902 return std::make_pair(0U, &X86::GR32RegClass);
57903 if (VT != MVT::f80 && !VT.isVector())
57904 return std::make_pair(0U, &X86::GR64RegClass);
57905 break;
57906 }
57907 [[fallthrough]];
57908 // 32-bit fallthrough
57909 case 'Q': // Q_REGS
57910 if (VT == MVT::i8 || VT == MVT::i1)
57911 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
57912 if (VT == MVT::i16)
57913 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
57914 if (VT == MVT::i32 || VT == MVT::f32 ||
57915 (!VT.isVector() && !Subtarget.is64Bit()))
57916 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
57917 if (VT != MVT::f80 && !VT.isVector())
57918 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
57919 break;
57920 case 'r': // GENERAL_REGS
57921 case 'l': // INDEX_REGS
57922 if (VT == MVT::i8 || VT == MVT::i1)
57923 return std::make_pair(0U, &X86::GR8RegClass);
57924 if (VT == MVT::i16)
57925 return std::make_pair(0U, &X86::GR16RegClass);
57926 if (VT == MVT::i32 || VT == MVT::f32 ||
57927 (!VT.isVector() && !Subtarget.is64Bit()))
57928 return std::make_pair(0U, &X86::GR32RegClass);
57929 if (VT != MVT::f80 && !VT.isVector())
57930 return std::make_pair(0U, &X86::GR64RegClass);
57931 break;
57932 case 'R': // LEGACY_REGS
57933 if (VT == MVT::i8 || VT == MVT::i1)
57934 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
57935 if (VT == MVT::i16)
57936 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
57937 if (VT == MVT::i32 || VT == MVT::f32 ||
57938 (!VT.isVector() && !Subtarget.is64Bit()))
57939 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
57940 if (VT != MVT::f80 && !VT.isVector())
57941 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
57942 break;
57943 case 'f': // FP Stack registers.
57944 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
57945 // value to the correct fpstack register class.
57946 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
57947 return std::make_pair(0U, &X86::RFP32RegClass);
57948 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
57949 return std::make_pair(0U, &X86::RFP64RegClass);
57950 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
57951 return std::make_pair(0U, &X86::RFP80RegClass);
57952 break;
57953 case 'y': // MMX_REGS if MMX allowed.
57954 if (!Subtarget.hasMMX()) break;
57955 return std::make_pair(0U, &X86::VR64RegClass);
57956 case 'v':
57957 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
57958 if (!Subtarget.hasSSE1()) break;
57959 bool VConstraint = (Constraint[0] == 'v');
57960
57961 switch (VT.SimpleTy) {
57962 default: break;
57963 // Scalar SSE types.
57964 case MVT::f16:
57965 if (VConstraint && Subtarget.hasFP16())
57966 return std::make_pair(0U, &X86::FR16XRegClass);
57967 break;
57968 case MVT::f32:
57969 case MVT::i32:
57970 if (VConstraint && Subtarget.hasVLX())
57971 return std::make_pair(0U, &X86::FR32XRegClass);
57972 return std::make_pair(0U, &X86::FR32RegClass);
57973 case MVT::f64:
57974 case MVT::i64:
57975 if (VConstraint && Subtarget.hasVLX())
57976 return std::make_pair(0U, &X86::FR64XRegClass);
57977 return std::make_pair(0U, &X86::FR64RegClass);
57978 case MVT::i128:
57979 if (Subtarget.is64Bit()) {
57980 if (VConstraint && Subtarget.hasVLX())
57981 return std::make_pair(0U, &X86::VR128XRegClass);
57982 return std::make_pair(0U, &X86::VR128RegClass);
57983 }
57984 break;
57985 // Vector types and fp128.
57986 case MVT::v8f16:
57987 if (!Subtarget.hasFP16())
57988 break;
57989 [[fallthrough]];
57990 case MVT::f128:
57991 case MVT::v16i8:
57992 case MVT::v8i16:
57993 case MVT::v4i32:
57994 case MVT::v2i64:
57995 case MVT::v4f32:
57996 case MVT::v2f64:
57997 if (VConstraint && Subtarget.hasVLX())
57998 return std::make_pair(0U, &X86::VR128XRegClass);
57999 return std::make_pair(0U, &X86::VR128RegClass);
58000 // AVX types.
58001 case MVT::v16f16:
58002 if (!Subtarget.hasFP16())
58003 break;
58004 [[fallthrough]];
58005 case MVT::v32i8:
58006 case MVT::v16i16:
58007 case MVT::v8i32:
58008 case MVT::v4i64:
58009 case MVT::v8f32:
58010 case MVT::v4f64:
58011 if (VConstraint && Subtarget.hasVLX())
58012 return std::make_pair(0U, &X86::VR256XRegClass);
58013 if (Subtarget.hasAVX())
58014 return std::make_pair(0U, &X86::VR256RegClass);
58015 break;
58016 case MVT::v32f16:
58017 if (!Subtarget.hasFP16())
58018 break;
58019 [[fallthrough]];
58020 case MVT::v64i8:
58021 case MVT::v32i16:
58022 case MVT::v8f64:
58023 case MVT::v16f32:
58024 case MVT::v16i32:
58025 case MVT::v8i64:
58026 if (!Subtarget.hasAVX512()) break;
58027 if (VConstraint)
58028 return std::make_pair(0U, &X86::VR512RegClass);
58029 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58030 }
58031 break;
58032 }
58033 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58034 switch (Constraint[1]) {
58035 default:
58036 break;
58037 case 'i':
58038 case 't':
58039 case '2':
58040 return getRegForInlineAsmConstraint(TRI, "x", VT);
58041 case 'm':
58042 if (!Subtarget.hasMMX()) break;
58043 return std::make_pair(0U, &X86::VR64RegClass);
58044 case 'z':
58045 if (!Subtarget.hasSSE1()) break;
58046 switch (VT.SimpleTy) {
58047 default: break;
58048 // Scalar SSE types.
58049 case MVT::f16:
58050 if (!Subtarget.hasFP16())
58051 break;
58052 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58053 case MVT::f32:
58054 case MVT::i32:
58055 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58056 case MVT::f64:
58057 case MVT::i64:
58058 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58059 case MVT::v8f16:
58060 if (!Subtarget.hasFP16())
58061 break;
58062 [[fallthrough]];
58063 case MVT::f128:
58064 case MVT::v16i8:
58065 case MVT::v8i16:
58066 case MVT::v4i32:
58067 case MVT::v2i64:
58068 case MVT::v4f32:
58069 case MVT::v2f64:
58070 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58071 // AVX types.
58072 case MVT::v16f16:
58073 if (!Subtarget.hasFP16())
58074 break;
58075 [[fallthrough]];
58076 case MVT::v32i8:
58077 case MVT::v16i16:
58078 case MVT::v8i32:
58079 case MVT::v4i64:
58080 case MVT::v8f32:
58081 case MVT::v4f64:
58082 if (Subtarget.hasAVX())
58083 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58084 break;
58085 case MVT::v32f16:
58086 if (!Subtarget.hasFP16())
58087 break;
58088 [[fallthrough]];
58089 case MVT::v64i8:
58090 case MVT::v32i16:
58091 case MVT::v8f64:
58092 case MVT::v16f32:
58093 case MVT::v16i32:
58094 case MVT::v8i64:
58095 if (Subtarget.hasAVX512())
58096 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58097 break;
58098 }
58099 break;
58100 case 'k':
58101 // This register class doesn't allocate k0 for masked vector operations.
58102 if (Subtarget.hasAVX512()) {
58103 if (VT == MVT::i1)
58104 return std::make_pair(0U, &X86::VK1WMRegClass);
58105 if (VT == MVT::i8)
58106 return std::make_pair(0U, &X86::VK8WMRegClass);
58107 if (VT == MVT::i16)
58108 return std::make_pair(0U, &X86::VK16WMRegClass);
58109 }
58110 if (Subtarget.hasBWI()) {
58111 if (VT == MVT::i32)
58112 return std::make_pair(0U, &X86::VK32WMRegClass);
58113 if (VT == MVT::i64)
58114 return std::make_pair(0U, &X86::VK64WMRegClass);
58115 }
58116 break;
58117 }
58118 }
58119
58120 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58121 return std::make_pair(0U, &X86::GR32RegClass);
58122
58123 // Use the default implementation in TargetLowering to convert the register
58124 // constraint into a member of a register class.
58125 std::pair<Register, const TargetRegisterClass*> Res;
58126 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58127
58128 // Not found as a standard register?
58129 if (!Res.second) {
58130 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58131 // to/from f80.
58132 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58133 // Map "st(0)" .. "st(7)" to the corresponding FP register.
58134 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58135 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58136 Constraint[3] == '(' &&
58137 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58138 Constraint[5] == ')' && Constraint[6] == '}') {
58139 // st(7) is not allocatable and thus not a member of RFP80. Return
58140 // singleton class in cases where we have a reference to it.
58141 if (Constraint[4] == '7')
58142 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58143 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58144 &X86::RFP80RegClass);
58145 }
58146
58147 // GCC allows "st(0)" to be called just plain "st".
58148 if (StringRef("{st}").equals_insensitive(Constraint))
58149 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58150 }
58151
58152 // flags -> EFLAGS
58153 if (StringRef("{flags}").equals_insensitive(Constraint))
58154 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58155
58156 // dirflag -> DF
58157 // Only allow for clobber.
58158 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58159 VT == MVT::Other)
58160 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58161
58162 // fpsr -> FPSW
58163 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58164 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58165
58166 return Res;
58167 }
58168
58169 // Make sure it isn't a register that requires 64-bit mode.
58170 if (!Subtarget.is64Bit() &&
58171 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58172 TRI->getEncodingValue(Res.first) >= 8) {
58173 // Register requires REX prefix, but we're in 32-bit mode.
58174 return std::make_pair(0, nullptr);
58175 }
58176
58177 // Make sure it isn't a register that requires AVX512.
58178 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58179 TRI->getEncodingValue(Res.first) & 0x10) {
58180 // Register requires EVEX prefix.
58181 return std::make_pair(0, nullptr);
58182 }
58183
58184 // Otherwise, check to see if this is a register class of the wrong value
58185 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58186 // turn into {ax},{dx}.
58187 // MVT::Other is used to specify clobber names.
58188 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58189 return Res; // Correct type already, nothing to do.
58190
58191 // Get a matching integer of the correct size. E.g. "ax" with MVT::i32 should
58192 // return "eax". This should even work for things like getting 64-bit integer
58193 // registers when given an f64 type.
58194 const TargetRegisterClass *Class = Res.second;
58195 // The generic code will match the first register class that contains the
58196 // given register. Thus, based on the ordering of the tablegened file,
58197 // the "plain" GR classes might not come first.
58198 // Therefore, use a helper method.
58199 if (isGRClass(*Class)) {
58200 unsigned Size = VT.getSizeInBits();
58201 if (Size == 1) Size = 8;
58202 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58203 return std::make_pair(0, nullptr);
58204 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58205 if (DestReg.isValid()) {
58206 bool is64Bit = Subtarget.is64Bit();
58207 const TargetRegisterClass *RC =
58208 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58209 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58210 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58211 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58212 if (Size == 64 && !is64Bit) {
58213 // Model GCC's behavior here and select a fixed pair of 32-bit
58214 // registers.
58215 switch (DestReg) {
58216 case X86::RAX:
58217 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58218 case X86::RDX:
58219 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58220 case X86::RCX:
58221 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58222 case X86::RBX:
58223 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58224 case X86::RSI:
58225 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58226 case X86::RDI:
58227 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58228 case X86::RBP:
58229 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58230 default:
58231 return std::make_pair(0, nullptr);
58232 }
58233 }
58234 if (RC && RC->contains(DestReg))
58235 return std::make_pair(DestReg, RC);
58236 return Res;
58237 }
58238 // No register found/type mismatch.
58239 return std::make_pair(0, nullptr);
58240 } else if (isFRClass(*Class)) {
58241 // Handle references to XMM physical registers that got mapped into the
58242 // wrong class. This can happen with constraints like {xmm0} where the
58243 // target independent register mapper will just pick the first match it can
58244 // find, ignoring the required type.
58245
58246 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58247 if (VT == MVT::f16)
58248 Res.second = &X86::FR16XRegClass;
58249 else if (VT == MVT::f32 || VT == MVT::i32)
58250 Res.second = &X86::FR32XRegClass;
58251 else if (VT == MVT::f64 || VT == MVT::i64)
58252 Res.second = &X86::FR64XRegClass;
58253 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58254 Res.second = &X86::VR128XRegClass;
58255 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58256 Res.second = &X86::VR256XRegClass;
58257 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58258 Res.second = &X86::VR512RegClass;
58259 else {
58260 // Type mismatch and not a clobber: return an error.
58261 Res.first = 0;
58262 Res.second = nullptr;
58263 }
58264 } else if (isVKClass(*Class)) {
58265 if (VT == MVT::i1)
58266 Res.second = &X86::VK1RegClass;
58267 else if (VT == MVT::i8)
58268 Res.second = &X86::VK8RegClass;
58269 else if (VT == MVT::i16)
58270 Res.second = &X86::VK16RegClass;
58271 else if (VT == MVT::i32)
58272 Res.second = &X86::VK32RegClass;
58273 else if (VT == MVT::i64)
58274 Res.second = &X86::VK64RegClass;
58275 else {
58276 // Type mismatch and not a clobber: return an error.
58277 Res.first = 0;
58278 Res.second = nullptr;
58279 }
58280 }
58281
58282 return Res;
58283}
58284
58285bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58286 // Integer division on x86 is expensive. However, when aggressively optimizing
58287 // for code size, we prefer to use a div instruction, as it is usually smaller
58288 // than the alternative sequence.
58289 // The exception to this is vector division. Since x86 doesn't have vector
58290 // integer division, leaving the division as-is is a loss even in terms of
58291 // size, because it will have to be scalarized, while the alternative code
58292 // sequence can be performed in vector form.
58293 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58294 return OptSize && !VT.isVector();
58295}
58296
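A brief, hedged illustration of the policy above (not part of the analyzed file; the minsize attribute is a Clang extension that maps to the MinSize IR attribute checked here): under minsize a scalar unsigned division is expected to stay a div instruction because that is smaller, whereas at ordinary optimization levels the same division is usually expanded to a multiply-by-magic-constant sequence, and vector division is always expanded since x86 has no vector integer divide.

// Compile with clang -O2 and compare the code generated for the two
// functions; only the minsize one is expected to keep an actual div.
__attribute__((minsize)) unsigned div7_small(unsigned x) { return x / 7; }
unsigned div7_fast(unsigned x) { return x / 7; }

int main() { return (div7_small(70) + div7_fast(70) == 20) ? 0 : 1; }
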
58297void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58298 if (!Subtarget.is64Bit())
58299 return;
58300
58301 // Update IsSplitCSR in X86MachineFunctionInfo.
58302 X86MachineFunctionInfo *AFI =
58303 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58304 AFI->setIsSplitCSR(true);
58305}
58306
58307void X86TargetLowering::insertCopiesSplitCSR(
58308 MachineBasicBlock *Entry,
58309 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58310 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58311 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58312 if (!IStart)
58313 return;
58314
58315 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58316 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58317 MachineBasicBlock::iterator MBBI = Entry->begin();
58318 for (const MCPhysReg *I = IStart; *I; ++I) {
58319 const TargetRegisterClass *RC = nullptr;
58320 if (X86::GR64RegClass.contains(*I))
58321 RC = &X86::GR64RegClass;
58322 else
58323 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58324
58325 Register NewVR = MRI->createVirtualRegister(RC);
58326 // Create copy from CSR to a virtual register.
58327 // FIXME: this currently does not emit CFI pseudo-instructions; it works
58328 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58329 // nounwind. If we want to generalize this later, we may need to emit
58330 // CFI pseudo-instructions.
58331 assert(
58332 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58333 "Function should be nounwind in insertCopiesSplitCSR!");
58334 Entry->addLiveIn(*I);
58335 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
58336 .addReg(*I);
58337
58338 // Insert the copy-back instructions right before the terminator.
58339 for (auto *Exit : Exits)
58340 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
58341 TII->get(TargetOpcode::COPY), *I)
58342 .addReg(NewVR);
58343 }
58344}
58345
58346bool X86TargetLowering::supportSwiftError() const {
58347 return Subtarget.is64Bit();
58348}
58349
58350/// Returns true if stack probing through a function call is requested.
58351bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58352 return !getStackProbeSymbolName(MF).empty();
58353}
58354
58355/// Returns true if stack probing through inline assembly is requested.
58356bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58357
58358 // No inline stack probes for Windows; it has its own mechanism.
58359 if (Subtarget.isOSWindows() ||
58360 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58361 return false;
58362
58363 // If the function specifically requests inline stack probes, emit them.
58364 if (MF.getFunction().hasFnAttribute("probe-stack"))
58365 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58366 "inline-asm";
58367
58368 return false;
58369}
58370
58371/// Returns the name of the symbol used to emit stack probes or the empty
58372/// string if not applicable.
58373StringRef
58374X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58375 // Inline stack probes disable the stack probe call.
58376 if (hasInlineStackProbe(MF))
58377 return "";
58378
58379 // If the function specifically requests stack probes, emit them.
58380 if (MF.getFunction().hasFnAttribute("probe-stack"))
58381 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58382
58383 // Generally, if we aren't on Windows, the platform ABI does not include
58384 // support for stack probes, so don't emit them.
58385 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58386 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58387 return "";
58388
58389 // We need a stack probe to conform to the Windows ABI. Choose the right
58390 // symbol.
58391 if (Subtarget.is64Bit())
58392 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58393 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58394}
58395
58396unsigned
58397X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58398 // The default stack probe size is 4096 if the function has no
58399 // "stack-probe-size" attribute.
58400 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58401 4096);
58402}
58403
58404Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58405 if (ML->isInnermost() &&
58406 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58407 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58408 return TargetLowering::getPrefLoopAlignment();
58409}